You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/langchain/document_loaders/blob_loaders/youtube_audio.py

51 lines
1.5 KiB
Python

from typing import Iterable, List
from langchain.document_loaders.blob_loaders import FileSystemBlobLoader
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader
class YoutubeAudioLoader(BlobLoader):
"""Load YouTube urls as audio file(s)."""
def __init__(self, urls: List[str], save_dir: str):
if not isinstance(urls, list):
raise TypeError("urls must be a list")
self.urls = urls
self.save_dir = save_dir
def yield_blobs(self) -> Iterable[Blob]:
"""Yield audio blobs for each url."""
try:
import yt_dlp
except ImportError:
raise ValueError(
"yt_dlp package not found, please install it with "
"`pip install yt_dlp`"
)
# Use yt_dlp to download audio given a YouTube url
ydl_opts = {
"format": "m4a/bestaudio/best",
"noplaylist": True,
"outtmpl": self.save_dir + "/%(title)s.%(ext)s",
"postprocessors": [
{
"key": "FFmpegExtractAudio",
"preferredcodec": "m4a",
}
],
}
for url in self.urls:
# Download file
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
ydl.download(url)
# Yield the written blobs
loader = FileSystemBlobLoader(self.save_dir, glob="*.m4a")
for blob in loader.yield_blobs():
yield blob