|
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
import logging
|
|
|
|
import logging
|
|
|
|
from pathlib import Path
|
|
|
|
from pathlib import Path
|
|
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from typing import Any, Dict, List, Optional, Sequence, Union
|
|
|
|
from urllib.parse import parse_qs, urlparse
|
|
|
|
from urllib.parse import parse_qs, urlparse
|
|
|
|
|
|
|
|
|
|
|
|
from pydantic import root_validator
|
|
|
|
from pydantic import root_validator
|
|
|
@ -146,13 +146,19 @@ class YoutubeLoader(BaseLoader):
|
|
|
|
self,
|
|
|
|
self,
|
|
|
|
video_id: str,
|
|
|
|
video_id: str,
|
|
|
|
add_video_info: bool = False,
|
|
|
|
add_video_info: bool = False,
|
|
|
|
language: str = "en",
|
|
|
|
language: Union[str, Sequence[str]] = "en",
|
|
|
|
|
|
|
|
translation: str = "en",
|
|
|
|
continue_on_failure: bool = False,
|
|
|
|
continue_on_failure: bool = False,
|
|
|
|
):
|
|
|
|
):
|
|
|
|
"""Initialize with YouTube video ID."""
|
|
|
|
"""Initialize with YouTube video ID."""
|
|
|
|
self.video_id = video_id
|
|
|
|
self.video_id = video_id
|
|
|
|
self.add_video_info = add_video_info
|
|
|
|
self.add_video_info = add_video_info
|
|
|
|
self.language = language
|
|
|
|
self.language = language
|
|
|
|
|
|
|
|
if isinstance(language, str):
|
|
|
|
|
|
|
|
self.language = [language]
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
self.language = language
|
|
|
|
|
|
|
|
self.translation = translation
|
|
|
|
self.continue_on_failure = continue_on_failure
|
|
|
|
self.continue_on_failure = continue_on_failure
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
@staticmethod
|
|
|
@ -199,10 +205,10 @@ class YoutubeLoader(BaseLoader):
|
|
|
|
return []
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
transcript = transcript_list.find_transcript([self.language])
|
|
|
|
transcript = transcript_list.find_transcript(self.language)
|
|
|
|
except NoTranscriptFound:
|
|
|
|
except NoTranscriptFound:
|
|
|
|
en_transcript = transcript_list.find_transcript(["en"])
|
|
|
|
en_transcript = transcript_list.find_transcript(["en"])
|
|
|
|
transcript = en_transcript.translate(self.language)
|
|
|
|
transcript = en_transcript.translate(self.translation)
|
|
|
|
|
|
|
|
|
|
|
|
transcript_pieces = transcript.fetch()
|
|
|
|
transcript_pieces = transcript.fetch()
|
|
|
|
|
|
|
|
|
|
|
|