Improve video_id extraction in YoutubeLoader (#4452)

# Improve video_id extraction in `YoutubeLoader`

`YoutubeLoader.from_youtube_url` can only handle one specific URL
format. I've introduced `YoutubeLoader.extract_video_id`, which
extracts the video ID from common YouTube URL formats.

Fixes #4451 


@eyurtsev

---------

Co-authored-by: Kamil Niski <kamil.niski@gmail.com>
This commit is contained in:
KNiski 2023-05-15 16:45:19 +02:00 committed by GitHub
parent 8b42e8a510
commit c2761aa8f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 62 additions and 1 deletions

View File

@ -4,6 +4,7 @@ from __future__ import annotations
import logging import logging
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import re
from pydantic import root_validator from pydantic import root_validator
from pydantic.dataclasses import dataclass from pydantic.dataclasses import dataclass
@ -96,6 +97,34 @@ class GoogleApiClient:
return creds return creds
# Verbose-mode pattern that recognises common YouTube URL shapes (watch pages,
# embeds, shorts, youtu.be / vid.plus short links) as well as a bare
# 11-character video ID. The ID is exposed through the named group ``id``.
#
# Group 1 is the optional URL prefix (scheme + host + path up to the ID).
# When it is absent the whole input must be exactly the 11-character ID;
# when it is present anything may trail the ID (e.g. ``?t=1s``).
#
# NOTE: the short-link host alternation must not contain an empty alternative,
# otherwise a host-less string such as ``http:///<id>`` is accepted as valid.
YT_URL_RE = re.compile(
    r"""(?x)^
    (
        (?:https?://|//)                     # http(s):// or protocol-independent URL
        (?:
            (?:
                (?:
                    (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
                    youtube\.googleapis\.com
                )/                           # the various hostnames, with wildcard subdomains
                (?:.*?\#/)?                  # handle anchor (#/) redirect urls
                (?:                          # the various things that can precede the ID:
                    (?:(?:v|embed|e)/(?!videoseries))   # v/ or embed/ or e/
                    |shorts/
                    |(?:                     # or the v= param in all its forms
                        (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                        (?:\?|\#!?)          # the params delimiter ? or # or #!
                        (?:.*?[&;])??        # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
                        v=
                    )
                )
            )
            |(?:
                youtu\.be|                   # just youtu.be/xxxx
                vid\.plus                    # or vid.plus/xxxx
            )/
        )
    )?                                       # all until now is optional -> you can pass the naked ID
    (?P<id>[0-9A-Za-z_-]{11})                # here it is! the YouTube video ID
    (?(1).+)?                                # if a URL prefix matched, anything may follow the ID
    $"""
)
class YoutubeLoader(BaseLoader): class YoutubeLoader(BaseLoader):
"""Loader that loads Youtube transcripts.""" """Loader that loads Youtube transcripts."""
@ -113,10 +142,18 @@ class YoutubeLoader(BaseLoader):
self.language = language self.language = language
self.continue_on_failure = continue_on_failure self.continue_on_failure = continue_on_failure
@staticmethod
def extract_video_id(youtube_url: str) -> str:
    """Return the video id contained in ``youtube_url``.

    The input is matched against the module-level ``YT_URL_RE`` pattern and
    the text captured by its ``id`` group is returned.

    Raises:
        ValueError: if ``youtube_url`` does not match ``YT_URL_RE``.
    """
    found = YT_URL_RE.match(youtube_url)
    if found is not None:
        return found.group("id")
    # No match at all: report the offending input to the caller.
    raise ValueError(
        f"Could not determine the video ID for the URL {youtube_url}"
    )
@classmethod @classmethod
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader: def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
"""Given youtube URL, load video.""" """Given youtube URL, load video."""
video_id = youtube_url.split("youtube.com/watch?v=")[-1] video_id = cls.extract_video_id(youtube_url)
return cls(video_id, **kwargs) return cls(video_id, **kwargs)
def load(self) -> List[Document]: def load(self) -> List[Document]:

View File

@ -0,0 +1,24 @@
from langchain.document_loaders import YoutubeLoader
import pytest
@pytest.mark.parametrize(
    ("youtube_url", "expected_video_id"),
    [
        # Full watch URLs over plain http, with and without a subdomain.
        ("http://www.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("http://youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("http://m.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        # Short links.
        ("http://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("https://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"),
        # Full watch URLs over https.
        ("https://www.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        ("https://m.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        ("https://youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        # Short links carrying a trailing timestamp parameter.
        ("http://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),
        ("http://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
        ("https://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),
        # Embed URL on the no-cookie domain.
        ("http://www.youtube-nocookie.com/embed/lalOy8Mbfdc?rel=0", "lalOy8Mbfdc"),
        ("https://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
    ],
)
def test_video_id_extraction(youtube_url: str, expected_video_id: str):
    """Check that each sample URL yields the expected video id."""
    extracted = YoutubeLoader.extract_video_id(youtube_url)
    assert extracted == expected_video_id