From 420a17542db56281505145ff1e3036274b09da61 Mon Sep 17 00:00:00 2001 From: rafly lesmana Date: Mon, 20 Nov 2023 08:34:48 +0700 Subject: [PATCH] fix: Make YoutubeLoader support on demand language translation (#13583) **Description:** Refine YoutubeLoader's transcript-selection logic so that an available transcript can be translated on demand. The `translation` parameter now defaults to `None`, and the selected transcript is translated only when a target language is explicitly provided. **Issue:** Addresses the problem reported in #13523, where the YoutubeLoader translation feature was not functioning as expected. Tag maintainers/contributors who might be interested: @eyurtsev --------- Co-authored-by: Bagatur --- .../document_loaders/youtube_transcript.ipynb | 2 +- libs/langchain/langchain/document_loaders/youtube.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/docs/integrations/document_loaders/youtube_transcript.ipynb b/docs/docs/integrations/document_loaders/youtube_transcript.ipynb index 0e106f0110..86ee8d220e 100644 --- a/docs/docs/integrations/document_loaders/youtube_transcript.ipynb +++ b/docs/docs/integrations/document_loaders/youtube_transcript.ipynb @@ -99,7 +99,7 @@ "\n", "Language param : It's a list of language codes in a descending priority, `en` by default.\n", "\n", - "translation param : It's a translate preference when the youtube does'nt have your select language, `en` by default." + "translation param : It's a translate preference, you can translate available transcript to your preferred language." 
] }, { diff --git a/libs/langchain/langchain/document_loaders/youtube.py b/libs/langchain/langchain/document_loaders/youtube.py index 54cd35af21..f91adff611 100644 --- a/libs/langchain/langchain/document_loaders/youtube.py +++ b/libs/langchain/langchain/document_loaders/youtube.py @@ -146,7 +146,7 @@ class YoutubeLoader(BaseLoader): video_id: str, add_video_info: bool = False, language: Union[str, Sequence[str]] = "en", - translation: str = "en", + translation: Optional[str] = None, continue_on_failure: bool = False, ): """Initialize with YouTube video ID.""" @@ -206,8 +206,10 @@ class YoutubeLoader(BaseLoader): try: transcript = transcript_list.find_transcript(self.language) except NoTranscriptFound: - en_transcript = transcript_list.find_transcript(["en"]) - transcript = en_transcript.translate(self.translation) + transcript = transcript_list.find_transcript(["en"]) + + if self.translation is not None: + transcript = transcript.translate(self.translation) transcript_pieces = transcript.fetch()