community[minor]: Add audio-parser "faster-whisper" in audio.py (#20012)

faster-whisper is a reimplementation of OpenAI's Whisper model using
CTranslate2, which is up to 4 times faster than openai/whisper for the
same accuracy while using less memory. The efficiency can be further
improved with 8-bit quantization on both CPU and GPU.

It can automatically detect the following 14 languages and transcribe
the text into their respective languages: en, zh, fr, de, ja, ko, ru,
es, th, it, pt, vi, ar, tr.

The GitHub repository for faster-whisper is:
    https://github.com/SYSTRAN/faster-whisper

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
hulitaitai 2024-04-19 04:50:59 +08:00 committed by GitHub
parent e3c2431c5b
commit 7d0a008744
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -329,3 +329,135 @@ class YandexSTTParser(BaseBlobParser):
page_content=res.normalized_text,
metadata={"source": blob.source},
)
class FasterWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files with faster-whisper.

    faster-whisper is a reimplementation of OpenAI's Whisper model using
    CTranslate2, which is up to 4 times faster than openai/whisper for the
    same accuracy while using less memory. The efficiency can be further
    improved with 8-bit quantization on both CPU and GPU.

    It can automatically detect the following 14 languages and transcribe the
    text into their respective languages: en, zh, fr, de, ja, ko, ru, es, th,
    it, pt, vi, ar, tr.

    The GitHub repository for faster-whisper is:
        https://github.com/SYSTRAN/faster-whisper

    Example: Load a YouTube video and transcribe the video speech into a document.

        .. code-block:: python

            from langchain.document_loaders.generic import GenericLoader
            from langchain_community.document_loaders.parsers.audio import (
                FasterWhisperParser,
            )
            from langchain.document_loaders.blob_loaders.youtube_audio import (
                YoutubeAudioLoader,
            )

            url = "https://www.youtube.com/watch?v=your_video"
            save_dir = "your_dir/"
            loader = GenericLoader(
                YoutubeAudioLoader([url], save_dir),
                FasterWhisperParser(),
            )
            docs = loader.load()
    """

    # Model sizes shipped by faster-whisper, ordered by GPU-memory appetite.
    _VALID_MODEL_SIZES = ("base", "small", "medium", "large-v3")

    def __init__(
        self,
        *,
        device: Optional[str] = "cuda",
        model_size: Optional[str] = None,
    ):
        """Initialize the parser.

        Args:
            device: "cuda" or "cpu". Requesting "cuda" silently falls back
                to "cpu" when no CUDA device is available.
            model_size: One of "base", "small", "medium" or "large-v3".
                When None, a size is chosen automatically from the available
                GPU memory (always "base" on CPU).

        Raises:
            ImportError: If the ``torch`` package is not installed.
            ValueError: If ``model_size`` is not a recognized size.
        """
        try:
            import torch
        except ImportError:
            raise ImportError(
                "torch package not found, please install it with `pip install torch`"
            )

        # Determine the device to use; fall back to CPU when CUDA is absent.
        if device == "cpu":
            self.device = "cpu"
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"

        if model_size is not None:
            # Honor an explicit choice, but fail loudly on an unknown value
            # instead of silently substituting an auto-selected size.
            if model_size not in self._VALID_MODEL_SIZES:
                raise ValueError(
                    f"model_size must be one of {self._VALID_MODEL_SIZES}, "
                    f"got {model_size!r}"
                )
            self.model_size = model_size
        elif self.device == "cpu":
            self.model_size = "base"
        else:
            # Pick the largest model that fits the available GPU memory (MiB).
            mem = torch.cuda.get_device_properties(self.device).total_memory / (1024**2)
            if mem < 1000:
                self.model_size = "base"
            elif mem < 3000:
                self.model_size = "small"
            elif mem < 5000:
                self.model_size = "medium"
            else:
                self.model_size = "large-v3"

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob.

        Yields one Document per transcribed segment, with the segment
        timestamps, detected language, and language probability in the
        metadata.

        Raises:
            ImportError: If ``pydub`` or ``faster_whisper`` is not installed.
            ValueError: If the blob carries neither in-memory bytes nor a path.
        """
        import io

        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )
        try:
            from faster_whisper import WhisperModel
        except ImportError:
            raise ImportError(
                "faster_whisper package not found, please install it with "
                "`pip install faster-whisper`"
            )

        # Get the audio either from in-memory bytes or from a file on disk.
        if isinstance(blob.data, bytes):
            audio = AudioSegment.from_file(io.BytesIO(blob.data))
        elif blob.data is None and blob.path:
            audio = AudioSegment.from_file(blob.path)
        else:
            raise ValueError("Unable to get audio from blob")

        # Re-encode to mp3 in memory so faster-whisper gets a uniform input.
        file_obj = io.BytesIO(audio.export(format="mp3").read())

        # float16 is only supported on CUDA; CTranslate2 rejects it on CPU,
        # so use 8-bit quantization there (as the class docstring advertises
        # for CPU). The original hard-coded "float16" crashed on CPU.
        compute_type = "float16" if self.device == "cuda" else "int8"
        model = WhisperModel(
            self.model_size, device=self.device, compute_type=compute_type
        )

        segments, info = model.transcribe(file_obj, beam_size=5)
        for segment in segments:
            yield Document(
                page_content=segment.text,
                metadata={
                    "source": blob.source,
                    "timestamps": "[%.2fs -> %.2fs]" % (segment.start, segment.end),
                    "language": info.language,
                    "probability": "%d%%" % round(info.language_probability * 100),
                    **blob.metadata,
                },
            )