From d480330faee3c27d42c491d09a4af4cbb688cf74 Mon Sep 17 00:00:00 2001 From: Satoru Sakamoto <51464932+satoru814@users.noreply.github.com> Date: Fri, 24 Feb 2023 00:32:46 +0900 Subject: [PATCH] fix to specific language transcript (#1231) Currently the YouTube loader only seems to support English transcripts. Changed it to load the transcript in the specified language (default "en"). --- langchain/document_loaders/youtube.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/langchain/document_loaders/youtube.py b/langchain/document_loaders/youtube.py index aa191e32..b3e0fd25 100644 --- a/langchain/document_loaders/youtube.py +++ b/langchain/document_loaders/youtube.py @@ -10,10 +10,13 @@ from langchain.document_loaders.base import BaseLoader class YoutubeLoader(BaseLoader): """Loader that loads Youtube transcripts.""" - def __init__(self, video_id: str, add_video_info: bool = False): + def __init__( + self, video_id: str, add_video_info: bool = False, language: str = "en" + ): """Initialize with YouTube video ID.""" self.video_id = video_id self.add_video_info = add_video_info + self.language = language @classmethod def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader: @@ -39,7 +42,9 @@ class YoutubeLoader(BaseLoader): video_info = self._get_video_info() metadata.update(video_info) - transcript_pieces = YouTubeTranscriptApi.get_transcript( + self.video_id, languages=(self.language,) + ) transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces]) return [Document(page_content=transcript, metadata=metadata)]