Harrison/youtube multi language (#5758)

Co-authored-by: rafly lesmana <raflylesmana111@gmail.com>
This commit is contained in:
Harrison Chase 2023-06-05 16:38:07 -07:00 committed by GitHub
parent 2dcda8a8ac
commit 25487fa5ee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 7 deletions

View File

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "df770c72",
"metadata": {},
@ -55,11 +56,12 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "6b278a1b",
"metadata": {},
"source": [
"## Add video info"
"### Add video info"
]
},
{
@ -79,20 +81,36 @@
"metadata": {},
"outputs": [],
"source": [
"loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=True)"
"loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=True)\n",
"loader.load()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "fc417e31",
"metadata": {},
"source": [
"### Add language preferences\n",
"\n",
"Language param : It's a list of language codes in descending priority, `en` by default.\n",
"\n",
"translation param : It's the language the transcript is translated into when YouTube doesn't have a transcript in any of your selected languages, `en` by default."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97b98e92",
"id": "08510625",
"metadata": {},
"outputs": [],
"source": [
"loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=True, language=['en','id'], translation='en')\n",
"loader.load()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "65796cc5",
"metadata": {},

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Sequence, Union
from urllib.parse import parse_qs, urlparse
from pydantic import root_validator
@ -146,13 +146,19 @@ class YoutubeLoader(BaseLoader):
self,
video_id: str,
add_video_info: bool = False,
language: str = "en",
language: Union[str, Sequence[str]] = "en",
translation: str = "en",
continue_on_failure: bool = False,
):
"""Initialize with YouTube video ID."""
self.video_id = video_id
self.add_video_info = add_video_info
self.language = language
if isinstance(language, str):
self.language = [language]
else:
self.language = language
self.translation = translation
self.continue_on_failure = continue_on_failure
@staticmethod
@ -199,10 +205,10 @@ class YoutubeLoader(BaseLoader):
return []
try:
transcript = transcript_list.find_transcript([self.language])
transcript = transcript_list.find_transcript(self.language)
except NoTranscriptFound:
en_transcript = transcript_list.find_transcript(["en"])
transcript = en_transcript.translate(self.language)
transcript = en_transcript.translate(self.translation)
transcript_pieces = transcript.fetch()