You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/langchain/document_loaders/youtube.py

82 lines
2.6 KiB
Python

"""Loader that loads YouTube transcript."""
from __future__ import annotations
from typing import Any, List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class YoutubeLoader(BaseLoader):
"""Loader that loads Youtube transcripts."""
def __init__(
self, video_id: str, add_video_info: bool = False, language: str = "en"
):
"""Initialize with YouTube video ID."""
self.video_id = video_id
self.add_video_info = add_video_info
self.language = language
@classmethod
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
"""Parse out video id from YouTube url."""
video_id = youtube_url.split("youtube.com/watch?v=")[-1]
return cls(video_id, **kwargs)
def load(self) -> List[Document]:
"""Load documents."""
try:
from youtube_transcript_api import YouTubeTranscriptApi
except ImportError:
raise ImportError(
"Could not import youtube_transcript_api python package. "
"Please it install it with `pip install youtube-transcript-api`."
)
metadata = {"source": self.video_id}
if self.add_video_info:
# Get more video meta info
# Such as title, description, thumbnail url, publish_date
video_info = self._get_video_info()
metadata.update(video_info)
transcript_pieces = YouTubeTranscriptApi.get_transcript(
self.video_id, languages=(self.language,)
)
transcript = " ".join([t["text"].strip(" ") for t in transcript_pieces])
return [Document(page_content=transcript, metadata=metadata)]
def _get_video_info(self) -> dict:
"""Get important video information.
Components are:
- title
- description
- thumbnail url,
- publish_date
- channel_author
- and more.
"""
try:
from pytube import YouTube
except ImportError:
raise ImportError(
"Could not import pytube python package. "
"Please it install it with `pip install pytube`."
)
yt = YouTube(f"https://www.youtube.com/watch?v={self.video_id}")
video_info = {
"title": yt.title,
"description": yt.description,
"view_count": yt.views,
"thumbnail_url": yt.thumbnail_url,
"publish_date": yt.publish_date,
"length": yt.length,
"author": yt.author,
}
return video_info