forked from Archives/langchain
Improve video_id extraction in YoutubeLoader (#4452)
# Improve video_id extraction in `YoutubeLoader`. `YoutubeLoader.from_youtube_url` can only deal with one specific URL format. I've introduced `YoutubeLoader.extract_video_id`, which can extract the video ID from common YouTube URLs. Fixes #4451 @eyurtsev --------- Co-authored-by: Kamil Niski <kamil.niski@gmail.com>
This commit is contained in:
parent
8b42e8a510
commit
c2761aa8f4
@ -4,6 +4,7 @@ from __future__ import annotations
|
|||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any, Dict, List, Optional
|
||||||
|
import re
|
||||||
|
|
||||||
from pydantic import root_validator
|
from pydantic import root_validator
|
||||||
from pydantic.dataclasses import dataclass
|
from pydantic.dataclasses import dataclass
|
||||||
@ -96,6 +97,34 @@ class GoogleApiClient:
|
|||||||
|
|
||||||
return creds
|
return creds
|
||||||
|
|
||||||
|
# Verbose-mode pattern that recognises common YouTube video URLs (or a bare
# 11-character video id) and captures the id in the named group ``id``.
# Group 1 is the optional URL prefix; trailing text after the id is only
# allowed when that prefix matched.
YT_URL_RE = re.compile(
    r"""(?x)^
    (
        (?:https?://|//)            # http(s):// or protocol-independent URL
        (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
        youtube\.googleapis\.com)/  # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                 # handle anchor (#/) redirect urls
        (?:                         # the various things that can precede the ID:
            (?:(?:v|embed|e)/(?!videoseries))   # v/ or embed/ or e/
            |shorts/
            |(?:                    # or the v= param in all its forms
                # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?
                (?:\?|\#!?)         # the params delimiter ? or # or #!
                # any other preceding param (like /?s=tuff&v=xxxx)
                (?:.*?[&;])??
                v=
            )
        ))
        |(?:
            # No trailing "|" here: an empty alternative would let a URL
            # with no host at all (e.g. "https:///<id>") match.
            youtu\.be               # just youtu.be/xxxx
            |vid\.plus              # or vid.plus/xxxx
        )/
        )
    )?                              # all until now is optional -> naked ID allowed
    (?P<id>[0-9A-Za-z_-]{11})       # here it is! the YouTube video ID
    (?(1).+)?                       # if a URL prefix matched, anything can follow
    $"""
)
|
||||||
|
|
||||||
|
|
||||||
class YoutubeLoader(BaseLoader):
|
class YoutubeLoader(BaseLoader):
|
||||||
"""Loader that loads Youtube transcripts."""
|
"""Loader that loads Youtube transcripts."""
|
||||||
@ -113,10 +142,18 @@ class YoutubeLoader(BaseLoader):
|
|||||||
self.language = language
|
self.language = language
|
||||||
self.continue_on_failure = continue_on_failure
|
self.continue_on_failure = continue_on_failure
|
||||||
|
|
||||||
|
@staticmethod
def extract_video_id(youtube_url: str) -> str:
    """Return the video id found in ``youtube_url``.

    The input is matched against the module-level ``YT_URL_RE`` pattern and
    the contents of its ``id`` group are returned.

    Raises:
        ValueError: If ``YT_URL_RE`` does not match ``youtube_url``.
    """
    found = YT_URL_RE.match(youtube_url)
    if found is not None:
        return found.group("id")
    raise ValueError(f"Could not determine the video ID for the URL {youtube_url}")
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
|
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
|
||||||
"""Given youtube URL, load video."""
|
"""Given youtube URL, load video."""
|
||||||
video_id = youtube_url.split("youtube.com/watch?v=")[-1]
|
video_id = cls.extract_video_id(youtube_url)
|
||||||
return cls(video_id, **kwargs)
|
return cls(video_id, **kwargs)
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
|
@ -0,0 +1,24 @@
|
|||||||
|
from langchain.document_loaders import YoutubeLoader
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "youtube_url, expected_video_id",
    [
        ("http://www.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("http://youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("http://m.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("http://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("https://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("https://www.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        ("https://m.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        ("https://youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        ("http://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),
        ("http://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
        ("https://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),
        ("http://www.youtube-nocookie.com/embed/lalOy8Mbfdc?rel=0", "lalOy8Mbfdc"),
        ("https://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
    ],
)
def test_video_id_extraction(youtube_url: str, expected_video_id: str) -> None:
    """Check that each sample URL yields the expected video id."""
    extracted = YoutubeLoader.extract_video_id(youtube_url)
    assert extracted == expected_video_id
|
Loading…
Reference in New Issue
Block a user