Create OpenAIWhisperParser for generating Documents from audio files (#5580)

# OpenAIWhisperParser

This PR creates a new parser, `OpenAIWhisperParser`, that uses the
[OpenAI Whisper
model](https://platform.openai.com/docs/guides/speech-to-text/quickstart)
to perform transcription of audio files to text (`Documents`). Please
see the notebook for usage.
This commit is contained in:
Lance Martin 2023-06-05 15:51:13 -07:00 committed by GitHub
parent a4c9053d40
commit aea090045b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 122 additions and 0 deletions

View File

@ -30,6 +30,7 @@ For detailed instructions on how to get set up with Unstructured, see installati
:maxdepth: 1
:glob:
./document_loaders/examples/audio.ipynb
./document_loaders/examples/conll-u.ipynb
./document_loaders/examples/copypaste.ipynb
./document_loaders/examples/csv.ipynb

File diff suppressed because one or more lines are too long

View File

@ -1,3 +1,4 @@
from langchain.document_loaders.parsers.audio import OpenAIWhisperParser
from langchain.document_loaders.parsers.html import BS4HTMLParser
from langchain.document_loaders.parsers.pdf import (
PDFMinerParser,
@ -9,6 +10,7 @@ from langchain.document_loaders.parsers.pdf import (
__all__ = [
"BS4HTMLParser",
"OpenAIWhisperParser",
"PDFMinerParser",
"PDFPlumberParser",
"PyMuPDFParser",

View File

@ -0,0 +1,21 @@
from typing import Iterator
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.schema import Document
class OpenAIWhisperParser(BaseBlobParser):
"""Transcribe and parse audio files.
Audio transcription is with OpenAI Whisper model."""
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
import openai
with blob.as_bytes_io() as f:
transcript = openai.Audio.transcribe("whisper-1", f)
yield Document(
page_content=transcript.text, metadata={"source": blob.source}
)

View File

@ -5,6 +5,7 @@ def test_parsers_public_api_correct() -> None:
"""Test public API of parsers for breaking changes."""
assert set(__all__) == {
"BS4HTMLParser",
"OpenAIWhisperParser",
"PyPDFParser",
"PDFMinerParser",
"PyMuPDFParser",