diff --git a/libs/community/langchain_community/document_loaders/parsers/audio.py b/libs/community/langchain_community/document_loaders/parsers/audio.py index 4afe339938..31d542e87b 100644 --- a/libs/community/langchain_community/document_loaders/parsers/audio.py +++ b/libs/community/langchain_community/document_loaders/parsers/audio.py @@ -329,3 +329,135 @@ class YandexSTTParser(BaseBlobParser): page_content=res.normalized_text, metadata={"source": blob.source}, ) + + +class FasterWhisperParser(BaseBlobParser): + """Transcribe and parse audio files with faster-whisper. + + faster-whisper is a reimplementation of OpenAI's Whisper model using CTranslate2, + which is up to 4 times faster than openai/whisper for the same accuracy while using + less memory. The efficiency can be further improved with 8-bit quantization on both + CPU and GPU. + + It can automatically detect the following 14 languages and transcribe the text + into their respective languages: en, zh, fr, de, ja, ko, ru, es, th, it, pt, vi, + ar, tr. + + The gitbub repository for faster-whisper is : + https://github.com/SYSTRAN/faster-whisper + + Example: Load a YouTube video and transcribe the video speech into a document. + .. code-block:: python + + from langchain.document_loaders.generic import GenericLoader + from langchain_community.document_loaders.parsers.audio + import FasterWhisperParser + from langchain.document_loaders.blob_loaders.youtube_audio + import YoutubeAudioLoader + + + url="https://www.youtube.com/watch?v=your_video" + save_dir="your_dir/" + loader = GenericLoader( + YoutubeAudioLoader([url],save_dir), + FasterWhisperParser() + ) + docs = loader.load() + + """ + + def __init__( + self, + *, + device: Optional[str] = "cuda", + model_size: Optional[str] = None, + ): + """Initialize the parser. + + Args: + device: It can be "cuda" or "cpu" based on the available device. + model_size: There are four model sizes to choose from: "base", "small", + "medium", and "large-v3", based on the available GPU memory. + """ + try: + import torch + except ImportError: + raise ImportError( + "torch package not found, please install it with `pip install torch`" + ) + + # Determine the device to use + if device == "cpu": + self.device = "cpu" + else: + self.device = "cuda" if torch.cuda.is_available() else "cpu" + + # Determine the model_size + if self.device == "cpu": + self.model_size = "base" + else: + # Set the model_size based on the available memory + mem = torch.cuda.get_device_properties(self.device).total_memory / (1024**2) + if mem < 1000: + self.model_size = "base" + elif mem < 3000: + self.model_size = "small" + elif mem < 5000: + self.model_size = "medium" + else: + self.model_size = "large-v3" + # If the user has assigned a model size, then use the assigned size + if model_size is not None: + if model_size in ["base", "small", "medium", "large-v3"]: + self.model_size = model_size + + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """Lazily parse the blob.""" + + import io + + try: + from pydub import AudioSegment + except ImportError: + raise ImportError( + "pydub package not found, please install it with `pip install pydub`" + ) + + try: + from faster_whisper import WhisperModel + except ImportError: + raise ImportError( + "faster_whisper package not found, please install it with " + "`pip install faster-whisper`" + ) + + # get the audio + if isinstance(blob.data, bytes): + # blob contains the audio + audio = AudioSegment.from_file(io.BytesIO(blob.data)) + elif blob.data is None and blob.path: + # Audio file from disk + audio = AudioSegment.from_file(blob.path) + else: + raise ValueError("Unable to get audio from blob") + + file_obj = io.BytesIO(audio.export(format="mp3").read()) + + # Transcribe + model = WhisperModel( + self.model_size, device=self.device, compute_type="float16" + ) + + segments, info = model.transcribe(file_obj, beam_size=5) + + for segment in segments: + yield Document( + page_content=segment.text, + metadata={ + "source": blob.source, + "timestamps": "[%.2fs -> %.2fs]" % (segment.start, segment.end), + "language": info.language, + "probability": "%d%%" % round(info.language_probability * 100), + **blob.metadata, + }, + )