From c2761aa8f4266e97037aa25480b3c8e26e7417f3 Mon Sep 17 00:00:00 2001 From: KNiski Date: Mon, 15 May 2023 16:45:19 +0200 Subject: [PATCH] Improve video_id extraction in YoutubeLoader (#4452) # Improve video_id extraction in `YoutubeLoader` `YoutubeLoader.from_youtube_url` can only deal with one specific url format. I've introduced `YoutubeLoader.extract_video_id` which can extract video id from common YT urls. Fixes #4451 @eyurtsev --------- Co-authored-by: Kamil Niski --- langchain/document_loaders/youtube.py | 39 ++++++++++++++++++- .../document_loader/test_youtube.py | 24 ++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/langchain/document_loaders/youtube.py b/langchain/document_loaders/youtube.py index 4f586576..41c64212 100644 --- a/langchain/document_loaders/youtube.py +++ b/langchain/document_loaders/youtube.py @@ -4,6 +4,7 @@ from __future__ import annotations import logging from pathlib import Path from typing import Any, Dict, List, Optional +import re from pydantic import root_validator from pydantic.dataclasses import dataclass @@ -96,6 +97,34 @@ class GoogleApiClient: return creds +YT_URL_RE = re.compile( + r"""(?x)^ + ( + (?:https?://|//) # http(s):// or protocol-independent URL + (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com| + youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains + (?:.*?\#/)? # handle anchor (#/) redirect urls + (?: # the various things that can precede the ID: + (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ + |shorts/ + |(?: # or the v= param in all its forms + (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:\?|\#!?) # the params delimiter ? or # or #! + (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY) + v= + ) + )) + |(?: + youtu\.be| # just youtu.be/xxxx + vid\.plus| # or vid.plus/xxxx + )/ + ) + )? 
# all until now is optional -> you can pass the naked ID + (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID + (?(1).+)? # if we found the ID, everything can follow + $""" +) + class YoutubeLoader(BaseLoader): """Loader that loads Youtube transcripts.""" @@ -113,10 +142,18 @@ class YoutubeLoader(BaseLoader): self.language = language self.continue_on_failure = continue_on_failure + @staticmethod + def extract_video_id(youtube_url: str) -> str: + """Extract video id from common YT urls.""" + match = YT_URL_RE.match(youtube_url) + if not match: + raise ValueError(f"Could not determine the video ID for the URL {youtube_url}") + return match.group("id") + @classmethod def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader: """Given youtube URL, load video.""" - video_id = youtube_url.split("youtube.com/watch?v=")[-1] + video_id = cls.extract_video_id(youtube_url) return cls(video_id, **kwargs) def load(self) -> List[Document]: diff --git a/tests/unit_tests/document_loader/test_youtube.py b/tests/unit_tests/document_loader/test_youtube.py index e69de29b..933a54ff 100644 --- a/tests/unit_tests/document_loader/test_youtube.py +++ b/tests/unit_tests/document_loader/test_youtube.py @@ -0,0 +1,24 @@ +from langchain.document_loaders import YoutubeLoader +import pytest + + +@pytest.mark.parametrize( + "youtube_url, expected_video_id", + [ + ("http://www.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"), + ("http://youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"), + ("http://m.youtube.com/watch?v=-wtIMTCHWuI", "-wtIMTCHWuI"), + ("http://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"), + ("https://youtu.be/-wtIMTCHWuI", "-wtIMTCHWuI"), + ("https://www.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"), + ("https://m.youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"), + ("https://youtube.com/watch?v=lalOy8Mbfdc", "lalOy8Mbfdc"), + ("http://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"), + ("http://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"), +
("https://youtu.be/lalOy8Mbfdc?t=1", "lalOy8Mbfdc"), + ("http://www.youtube-nocookie.com/embed/lalOy8Mbfdc?rel=0", "lalOy8Mbfdc"), + ("https://youtu.be/lalOy8Mbfdc?t=1s", "lalOy8Mbfdc"), + ], +) +def test_video_id_extraction(youtube_url: str, expected_video_id: str): + assert YoutubeLoader.extract_video_id(youtube_url) == expected_video_id