Harrison/youtube multi language (#5758)

Co-authored-by: rafly lesmana <raflylesmana111@gmail.com>
This commit is contained in:
Harrison Chase 2023-06-05 16:38:07 -07:00 committed by GitHub
parent 2dcda8a8ac
commit 25487fa5ee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 7 deletions

View File

@ -1,6 +1,7 @@
{ {
"cells": [ "cells": [
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "df770c72", "id": "df770c72",
"metadata": {}, "metadata": {},
@ -55,11 +56,12 @@
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "6b278a1b", "id": "6b278a1b",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Add video info" "### Add video info"
] ]
}, },
{ {
@ -79,20 +81,36 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=True)" "loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=True)\n",
"loader.load()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "fc417e31",
"metadata": {},
"source": [
"### Add language preferences\n",
"\n",
"Language param : It's a list of language codes in descending order of priority, `en` by default.\n",
"\n",
"translation param : It's the language the transcript is translated into when YouTube doesn't have a transcript in any of your selected languages, `en` by default."
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "97b98e92", "id": "08510625",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"loader = YoutubeLoader.from_youtube_url(\"https://www.youtube.com/watch?v=QsYGlZkevEg\", add_video_info=True, language=['en','id'], translation='en')\n",
"loader.load()" "loader.load()"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "65796cc5", "id": "65796cc5",
"metadata": {}, "metadata": {},

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import logging import logging
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional, Sequence, Union
from urllib.parse import parse_qs, urlparse from urllib.parse import parse_qs, urlparse
from pydantic import root_validator from pydantic import root_validator
@ -146,13 +146,19 @@ class YoutubeLoader(BaseLoader):
self, self,
video_id: str, video_id: str,
add_video_info: bool = False, add_video_info: bool = False,
language: str = "en", language: Union[str, Sequence[str]] = "en",
translation: str = "en",
continue_on_failure: bool = False, continue_on_failure: bool = False,
): ):
"""Initialize with YouTube video ID.""" """Initialize with YouTube video ID."""
self.video_id = video_id self.video_id = video_id
self.add_video_info = add_video_info self.add_video_info = add_video_info
self.language = language self.language = language
if isinstance(language, str):
self.language = [language]
else:
self.language = language
self.translation = translation
self.continue_on_failure = continue_on_failure self.continue_on_failure = continue_on_failure
@staticmethod @staticmethod
@ -199,10 +205,10 @@ class YoutubeLoader(BaseLoader):
return [] return []
try: try:
transcript = transcript_list.find_transcript([self.language]) transcript = transcript_list.find_transcript(self.language)
except NoTranscriptFound: except NoTranscriptFound:
en_transcript = transcript_list.find_transcript(["en"]) en_transcript = transcript_list.find_transcript(["en"])
transcript = en_transcript.translate(self.language) transcript = en_transcript.translate(self.translation)
transcript_pieces = transcript.fetch() transcript_pieces = transcript.fetch()