Fix YouTube loader to fetch the transcript in a specified language (#1231)

Currently, the YouTube loader only seems to support videos with English audio.
This change makes it load the transcript in the specified language (default: "en").
searx-search-suffix
Satoru Sakamoto 1 year ago committed by GitHub
parent 6085fe18d4
commit d480330fae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -10,10 +10,13 @@ from langchain.document_loaders.base import BaseLoader
class YoutubeLoader(BaseLoader):
"""Loader that loads Youtube transcripts."""
def __init__(self, video_id: str, add_video_info: bool = False):
def __init__(
self, video_id: str, add_video_info: bool = False, language: str = "en"
):
"""Initialize with YouTube video ID."""
self.video_id = video_id
self.add_video_info = add_video_info
self.language = language
@classmethod
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
@ -39,7 +42,9 @@ class YoutubeLoader(BaseLoader):
video_info = self._get_video_info()
metadata.update(video_info)
transcript_pieces = YouTubeTranscriptApi.get_transcript(self.video_id)
transcript_pieces = YouTubeTranscriptApi.get_transcript(
self.video_id, languages=(self.language,)
)
transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
return [Document(page_content=transcript, metadata=metadata)]

Loading…
Cancel
Save