Improve video_id extraction in YoutubeLoader (#4452)

# Improve video_id extraction in `YoutubeLoader`. Previously, `YoutubeLoader.from_youtube_url` could only deal with one specific URL format. I've introduced `YoutubeLoader.extract_video_id`, which can extract the video ID from common YouTube URLs. Fixes #4451 @eyurtsev --------- Co-authored-by: Kamil Niski <kamil.niski@gmail.com>
2023-05-15 16:45:19 +02:00 · 2023-05-15 16:45:19 +02:00 · c2761aa8f4
commit c2761aa8f4
parent 8b42e8a510
2 changed files with 62 additions and 1 deletions
--- a/langchain/document_loaders/youtube.py
+++ b/langchain/document_loaders/youtube.py
@ -4,6 +4,7 @@ from __future__ import annotations
 import logging
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 import re
 from pydantic import root_validator
 from pydantic.dataclasses import dataclass
@ -96,6 +97,34 @@ class GoogleApiClient:
        return creds
 # Verbose-mode pattern that recognizes the common YouTube URL shapes
 # (watch?v=, v/, embed/, e/, shorts/, youtu.be/, vid.plus/) as well as a
 # bare 11-character video ID. The ID is captured in the named group "id".
 YT_URL_RE = re.compile(
    r"""(?x)^
     (
         (?:https?://|//)                                    # http(s):// or protocol-independent URL
         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
            youtube\.googleapis\.com)/                        # the various hostnames, with wildcard subdomains
         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
         (?:                                                  # the various things that can precede the ID:
             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
             |shorts/
             |(?:                                             # or the v= param in all its forms
                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                 (?:.*?[&;])??                                # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
                 v=
             )
         ))
         |(?:
            youtu\.be|                                        # just youtu.be/xxxx
            vid\.plus                                         # or vid.plus/xxxx (no trailing alternation bar:
                                                              # an empty alternative would accept host-less URLs)
         )/
         )
     )?                                                       # all until now is optional -> you can pass the naked ID
     (?P<id>[0-9A-Za-z_-]{11})                                # here is it! the YouTube video ID
     (?(1).+)?                                                # if we found the ID, everything can follow
     $"""
 )
 class YoutubeLoader(BaseLoader):
    """Loader that loads Youtube transcripts."""
@ -113,10 +142,18 @@ class YoutubeLoader(BaseLoader):
        self.language = language
        self.continue_on_failure = continue_on_failure
    @staticmethod
    def extract_video_id(youtube_url: str) -> str:
        """Return the video ID contained in ``youtube_url``.

        The URL is matched against the module-level ``YT_URL_RE`` pattern and
        the contents of its ``id`` group are returned.

        Raises:
            ValueError: If ``youtube_url`` does not match ``YT_URL_RE``.
        """
        parsed = YT_URL_RE.match(youtube_url)
        if parsed is not None:
            return parsed.group("id")
        raise ValueError(
            f"Could not determine the video ID for the URL {youtube_url}"
        )
    @classmethod
    def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
        """Create a loader for the video referenced by ``youtube_url``.

        The video ID is derived from the URL and passed, together with
        ``kwargs``, to the class constructor.
        """
-        video_id = youtube_url.split("youtube.com/watch?v=")[-1]
+        video_id = cls.extract_video_id(youtube_url)
        return cls(video_id, **kwargs)
    def load(self) -> List[Document]:
--- a/tests/unit_tests/document_loader/test_youtube.py
+++ b/tests/unit_tests/document_loader/test_youtube.py
@ -0,0 +1,24 @@
 from langchain.document_loaders import YoutubeLoader
 import pytest
@pytest.mark.parametrize(
    "youtube_url, expected_video_id",
    [
        ("http://www.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("http://youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("http://m.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("http://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("https://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"),
        ("https://www.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        ("https://m.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        ("https://youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
        ("http://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),
        ("http://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
        ("https://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),
        ("http://www.youtube-nocookie.com/embed/lalOy8Mbfdc?rel=0", "lalOy8Mbfdc"),
        ("https://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
    ],
)
def test_video_id_extraction(youtube_url: str, expected_video_id: str) -> None:
    """Check that ``extract_video_id`` yields the expected ID for each URL."""
    extracted = YoutubeLoader.extract_video_id(youtube_url)
    assert extracted == expected_video_id