Improve video_id extraction in YoutubeLoader (#4452)

# Improve video_id extraction in `YoutubeLoader`

`YoutubeLoader.from_youtube_url` could only handle one specific URL format (`youtube.com/watch?v=<id>`). This commit introduces `YoutubeLoader.extract_video_id`, which extracts the video ID from common YouTube URL formats.

Fixes #4451

@eyurtsev

---------

Co-authored-by: Kamil Niski <kamil.niski@gmail.com>
1 year ago · c2761aa8f4
parent 8b42e8a510
commit c2761aa8f4
2 changed files with 62 additions and 1 deletion
--- a/langchain/document_loaders/youtube.py
+++ b/langchain/document_loaders/youtube.py
@@ -4,6 +4,7 @@ from __future__ import annotations
 import logging
 from pathlib import Path
 from typing import Any, Dict, List, Optional
+import re

 from pydantic import root_validator
 from pydantic.dataclasses import dataclass
@@ -96,6 +97,34 @@ class GoogleApiClient:

        return creds

+YT_URL_RE = re.compile(  # full YouTube URL or bare 11-character video ID; the ID is captured in group "id"
+    r"""(?x)^
+     (
+         (?:https?://|//)                                    # http(s):// or protocol-independent URL
+         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
+            youtube\.googleapis\.com)/                        # the various hostnames, with wildcard subdomains
+         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
+         (?:                                                  # the various things that can precede the ID:
+             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
+             |shorts/
+             |(?:                                             # or the v= param in all its forms
+                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
+                 (?:.*?[&;])??                                # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&amp;v=V36LpHqtcDY)
+                 v=
+             )
+         ))
+         |(?:
+            youtu\.be|                                        # just youtu.be/xxxx
+            vid\.plus                                         # or vid.plus/xxxx (last alternative: a trailing "|" would also accept an empty host)
+         )/
+         )
+     )?                                                       # all until now is optional -> you can pass the naked ID
+     (?P<id>[0-9A-Za-z_-]{11})                                # here it is! the YouTube video ID
+     (?(1).+)?                                                # if we found the ID, everything can follow
+     $"""
+)
+

 class YoutubeLoader(BaseLoader):
    """Loader that loads Youtube transcripts."""
@@ -113,10 +142,18 @@ class YoutubeLoader(BaseLoader):
        self.language = language
        self.continue_on_failure = continue_on_failure

+    @staticmethod
+    def extract_video_id(youtube_url: str) -> str:
+        """Return the video ID parsed from a YouTube URL or bare ID, else raise ValueError."""
+        parsed = YT_URL_RE.match(youtube_url)
+        if parsed is not None:
+            return parsed.group("id")
+        raise ValueError(f"Could not determine the video ID for the URL {youtube_url}")
+
    @classmethod
    def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
        """Given youtube URL, load video."""
-        video_id = youtube_url.split("youtube.com/watch?v=")[-1]
+        video_id = cls.extract_video_id(youtube_url)
        return cls(video_id, **kwargs)

    def load(self) -> List[Document]:
--- a/tests/unit_tests/document_loader/test_youtube.py
+++ b/tests/unit_tests/document_loader/test_youtube.py
@@ -0,0 +1,24 @@
+from langchain.document_loaders import YoutubeLoader
+import pytest
+
+
+@pytest.mark.parametrize(
+    "youtube_url, expected_video_id",
+    [  # each case: (input URL, ID that extract_video_id should return)
+        ("http://www.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),  # ID starting with "-"
+        ("http://youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),  # no subdomain
+        ("http://m.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"),  # "m." subdomain
+        ("http://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"),  # youtu.be short link
+        ("https://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"),
+        ("https://www.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
+        ("https://m.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
+        ("https://youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"),
+        ("http://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),  # short link followed by a query string
+        ("http://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
+        ("https://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"),
+        ("http://www.youtube-nocookie.com/embed/lalOy8Mbfdc?rel=0", "lalOy8Mbfdc"),  # youtube-nocookie.com embed URL
+        ("https://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"),
+    ],
+)
+def test_video_id_extraction(youtube_url: str, expected_video_id: str):
+    assert YoutubeLoader.extract_video_id(youtube_url) == expected_video_id