YouTube Loader: Replace regexp with built-in parsing (#4729)

pull/4372/head^2
Eugene Yurtsev 1 year ago committed by GitHub
parent c70ae562b4
commit d3300bd799
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -4,7 +4,7 @@ from __future__ import annotations
import logging import logging
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import re from urllib.parse import parse_qs, urlparse
from pydantic import root_validator from pydantic import root_validator
from pydantic.dataclasses import dataclass from pydantic.dataclasses import dataclass
@ -97,33 +97,46 @@ class GoogleApiClient:
return creds return creds
YT_URL_RE = re.compile(
r"""(?x)^ ALLOWED_SCHEMAS = {"http", "https"}
( ALLOWED_NETLOCK = {
(?:https?://|//) # http(s):// or protocol-independent URL "youtu.be",
(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com| "m.youtube.com",
youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains "youtube.com",
(?:.*?\#/)? # handle anchor (#/) redirect urls "www.youtube.com",
(?: # the various things that can precede the ID: "www.youtube-nocookie.com",
(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ "vid.plus",
|shorts/ }
|(?: # or the v= param in all its forms
(?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #! def _parse_video_id(url: str) -> Optional[str]:
(?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) """Parse a youtube url and return the video id if valid, otherwise None."""
v= parsed_url = urlparse(url)
)
)) if parsed_url.scheme not in ALLOWED_SCHEMAS:
|(?: return None
youtu\.be| # just youtu.be/xxxx
vid\.plus| # or vid.plus/xxxx if parsed_url.netloc not in ALLOWED_NETLOCK:
)/ return None
)
)? # all until now is optional -> you can pass the naked ID path = parsed_url.path
(?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
(?(1).+)? # if we found the ID, everything can follow if path.endswith("/watch"):
$""" query = parsed_url.query
) parsed_query = parse_qs(query)
if "v" in parsed_query:
ids = parsed_query["v"]
video_id = ids if isinstance(ids, str) else ids[0]
else:
return None
else:
path = parsed_url.path.lstrip("/")
video_id = path.split("/")[-1]
if len(video_id) != 11: # Video IDs are 11 characters long
return None
return video_id
class YoutubeLoader(BaseLoader): class YoutubeLoader(BaseLoader):
@ -145,10 +158,12 @@ class YoutubeLoader(BaseLoader):
@staticmethod @staticmethod
def extract_video_id(youtube_url: str) -> str: def extract_video_id(youtube_url: str) -> str:
"""Extract video id from common YT urls.""" """Extract video id from common YT urls."""
match = YT_URL_RE.match(youtube_url) video_id = _parse_video_id(youtube_url)
if not match: if not video_id:
raise ValueError(f"Could not determine the video ID for the URL {youtube_url}") raise ValueError(
return match.group("id") f"Could not determine the video ID for the URL {youtube_url}"
)
return video_id
@classmethod @classmethod
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader: def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:

@ -1,6 +1,7 @@
from langchain.document_loaders import YoutubeLoader
import pytest import pytest
from langchain.document_loaders import YoutubeLoader
@pytest.mark.parametrize( @pytest.mark.parametrize(
"youtube_url, expected_video_id", "youtube_url, expected_video_id",
@ -18,7 +19,9 @@ import pytest
("https://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"), ("https://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),
("http://www.youtube-nocookie.com/embed/lalOy8Mbfdc?rel=0", "lalOy8Mbfdc"), ("http://www.youtube-nocookie.com/embed/lalOy8Mbfdc?rel=0", "lalOy8Mbfdc"),
("https://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"), ("https://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
("https://www.youtube.com/shorts/cd0Fy92_w_s", "cd0Fy92_w_s"),
], ],
) )
def test_video_id_extraction(youtube_url: str, expected_video_id: str): def test_video_id_extraction(youtube_url: str, expected_video_id: str) -> None:
"""Test that the video id is extracted from a youtube url"""
assert YoutubeLoader.extract_video_id(youtube_url) == expected_video_id assert YoutubeLoader.extract_video_id(youtube_url) == expected_video_id

Loading…
Cancel
Save