langchain/langchain/document_loaders/parsers/audio.py


from typing import Iterator

from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.schema import Document


class OpenAIWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files.
    Audio transcription is with OpenAI Whisper model."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        import io

        try:
            import openai
        except ImportError:
            raise ValueError(
                "openai package not found, please install it with "
                "`pip install openai`"
            )
        try:
            from pydub import AudioSegment
        except ImportError:
            raise ValueError(
                "pydub package not found, please install it with `pip install pydub`"
            )

        # Audio file from disk
        audio = AudioSegment.from_file(blob.path)

        # Define the duration of each chunk in minutes
        # Need to meet 25MB size limit for Whisper API
        chunk_duration = 20
        chunk_duration_ms = chunk_duration * 60 * 1000

        # Split the audio into chunk_duration_ms chunks
        for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)):
            # Audio chunk
            chunk = audio[i : i + chunk_duration_ms]
            file_obj = io.BytesIO(chunk.export(format="mp3").read())
            if blob.source is not None:
                file_obj.name = blob.source + f"_part_{split_number}.mp3"
            else:
                file_obj.name = f"part_{split_number}.mp3"

            # Transcribe
            print(f"Transcribing part {split_number+1}!")
            transcript = openai.Audio.transcribe("whisper-1", file_obj)

            yield Document(
                page_content=transcript.text,
                metadata={"source": blob.source, "chunk": split_number},
            )
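
A minimal usage sketch, not part of the file above: assuming a local audio file named "example.mp3" (a hypothetical path) and an OPENAI_API_KEY set in the environment, the parser can be driven directly through a Blob.

# Minimal usage sketch (assumes "example.mp3" exists locally and OPENAI_API_KEY is set).
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.audio import OpenAIWhisperParser

blob = Blob.from_path("example.mp3")
parser = OpenAIWhisperParser()

# lazy_parse yields one Document per ~20-minute chunk of audio.
for doc in parser.lazy_parse(blob):
    print(doc.metadata["chunk"], doc.page_content[:80])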