forked from Archives/langchain
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
82 lines
2.6 KiB
Python
82 lines
2.6 KiB
Python
"""Loader that loads YouTube transcript."""
|
|
from __future__ import annotations
|
|
|
|
from typing import Any, List
|
|
|
|
from langchain.docstore.document import Document
|
|
from langchain.document_loaders.base import BaseLoader
|
|
|
|
|
|
class YoutubeLoader(BaseLoader):
|
|
"""Loader that loads Youtube transcripts."""
|
|
|
|
def __init__(
|
|
self, video_id: str, add_video_info: bool = False, language: str = "en"
|
|
):
|
|
"""Initialize with YouTube video ID."""
|
|
self.video_id = video_id
|
|
self.add_video_info = add_video_info
|
|
self.language = language
|
|
|
|
@classmethod
|
|
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
|
|
"""Parse out video id from YouTube url."""
|
|
video_id = youtube_url.split("youtube.com/watch?v=")[-1]
|
|
return cls(video_id, **kwargs)
|
|
|
|
def load(self) -> List[Document]:
|
|
"""Load documents."""
|
|
try:
|
|
from youtube_transcript_api import YouTubeTranscriptApi
|
|
except ImportError:
|
|
raise ImportError(
|
|
"Could not import youtube_transcript_api python package. "
|
|
"Please it install it with `pip install youtube-transcript-api`."
|
|
)
|
|
|
|
metadata = {"source": self.video_id}
|
|
|
|
if self.add_video_info:
|
|
# Get more video meta info
|
|
# Such as title, description, thumbnail url, publish_date
|
|
video_info = self._get_video_info()
|
|
metadata.update(video_info)
|
|
|
|
transcript_pieces = YouTubeTranscriptApi.get_transcript(
|
|
self.video_id, languages=(self.language,)
|
|
)
|
|
transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
|
|
|
|
return [Document(page_content=transcript, metadata=metadata)]
|
|
|
|
def _get_video_info(self) -> dict:
|
|
"""Get important video information.
|
|
|
|
Components are:
|
|
- title
|
|
- description
|
|
- thumbnail url,
|
|
- publish_date
|
|
- channel_author
|
|
- and more.
|
|
"""
|
|
try:
|
|
from pytube import YouTube
|
|
|
|
except ImportError:
|
|
raise ImportError(
|
|
"Could not import pytube python package. "
|
|
"Please it install it with `pip install pytube`."
|
|
)
|
|
yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}")
|
|
video_info = {
|
|
"title": yt.title,
|
|
"description": yt.description,
|
|
"view_count": yt.views,
|
|
"thumbnail_url": yt.thumbnail_url,
|
|
"publish_date": yt.publish_date,
|
|
"length": yt.length,
|
|
"author": yt.author,
|
|
}
|
|
return video_info
|