mirror of
https://github.com/hwchase17/langchain
synced 2024-11-02 09:40:22 +00:00
community[patch]: youtube loader transcript format (#16625)
- **Description**: `YoutubeLoader` currently returns a single document that contains the entire transcript. I think it would be useful to add an option to return multiple documents instead, where each document contains one line of the transcript, with that line's start time and duration in the metadata. For example, [AssemblyAIAudioTranscriptLoader](https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/document_loaders/assemblyai.py) is implemented in a similar way: it lets you choose which format the document loader uses.
This commit is contained in:
parent
a936472512
commit
4e189cd89a
@ -2,6 +2,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Sequence, Union
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
@ -139,6 +140,11 @@ def _parse_video_id(url: str) -> Optional[str]:
|
||||
return video_id
|
||||
|
||||
|
||||
class TranscriptFormat(Enum):
    """Output layouts selectable when loading a transcript.

    The member values are the plain strings ``"text"`` and ``"lines"``.
    """

    # Whole transcript joined into a single document.
    TEXT = "text"
    # One document per transcript piece; the piece's non-text fields
    # are carried in that document's metadata.
    LINES = "lines"
|
||||
|
||||
|
||||
class YoutubeLoader(BaseLoader):
|
||||
"""Load `YouTube` transcripts."""
|
||||
|
||||
@ -148,6 +154,7 @@ class YoutubeLoader(BaseLoader):
|
||||
add_video_info: bool = False,
|
||||
language: Union[str, Sequence[str]] = "en",
|
||||
translation: Optional[str] = None,
|
||||
transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
|
||||
continue_on_failure: bool = False,
|
||||
):
|
||||
"""Initialize with YouTube video ID."""
|
||||
@ -159,6 +166,7 @@ class YoutubeLoader(BaseLoader):
|
||||
else:
|
||||
self.language = language
|
||||
self.translation = translation
|
||||
self.transcript_format = transcript_format
|
||||
self.continue_on_failure = continue_on_failure
|
||||
|
||||
@staticmethod
|
||||
@ -214,9 +222,19 @@ class YoutubeLoader(BaseLoader):
|
||||
|
||||
transcript_pieces = transcript.fetch()
|
||||
|
||||
transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
|
||||
|
||||
return [Document(page_content=transcript, metadata=metadata)]
|
||||
if self.transcript_format == TranscriptFormat.TEXT:
|
||||
transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
|
||||
return [Document(page_content=transcript, metadata=metadata)]
|
||||
elif self.transcript_format == TranscriptFormat.LINES:
|
||||
return [
|
||||
Document(
|
||||
page_content=t["text"].strip(" "),
|
||||
metadata=dict((key, t[key]) for key in t if key != "text"),
|
||||
)
|
||||
for t in transcript_pieces
|
||||
]
|
||||
else:
|
||||
raise ValueError("Unknown transcript format.")
|
||||
|
||||
def _get_video_info(self) -> dict:
|
||||
"""Get important video information.
|
||||
|
Loading…
Reference in New Issue
Block a user