mirror of
https://github.com/hwchase17/langchain
synced 2024-11-18 09:25:54 +00:00
community[minor]: Add audio-parser "faster-whisper" in audio.py (#20012)
faster-whisper is a reimplementation of OpenAI's Whisper model using CTranslate2, which is up to 4 times faster than openai/whisper for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU. It can automatically detect the following 14 languages and transcribe the text into their respective languages: en, zh, fr, de, ja, ko, ru, es, th, it, pt, vi, ar, tr. The GitHub repository for faster-whisper is: https://github.com/SYSTRAN/faster-whisper --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
e3c2431c5b
commit
7d0a008744
@ -329,3 +329,135 @@ class YandexSTTParser(BaseBlobParser):
|
||||
page_content=res.normalized_text,
|
||||
metadata={"source": blob.source},
|
||||
)
|
||||
|
||||
|
||||
class FasterWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files with faster-whisper.

    faster-whisper is a reimplementation of OpenAI's Whisper model using
    CTranslate2, which is up to 4 times faster than openai/whisper for the
    same accuracy while using less memory. The efficiency can be further
    improved with 8-bit quantization on both CPU and GPU.

    It can automatically detect the following 14 languages and transcribe the
    text into their respective languages: en, zh, fr, de, ja, ko, ru, es, th,
    it, pt, vi, ar, tr.

    The GitHub repository for faster-whisper is:
    https://github.com/SYSTRAN/faster-whisper

    Example: Load a YouTube video and transcribe the video speech into a document.
        .. code-block:: python

            from langchain.document_loaders.generic import GenericLoader
            from langchain_community.document_loaders.parsers.audio import (
                FasterWhisperParser,
            )
            from langchain.document_loaders.blob_loaders.youtube_audio import (
                YoutubeAudioLoader,
            )

            url = "https://www.youtube.com/watch?v=your_video"
            save_dir = "your_dir/"
            loader = GenericLoader(
                YoutubeAudioLoader([url], save_dir),
                FasterWhisperParser(),
            )
            docs = loader.load()

    """

    # Model sizes this parser accepts, from smallest to largest.
    _VALID_MODEL_SIZES = ("base", "small", "medium", "large-v3")

    def __init__(
        self,
        *,
        device: Optional[str] = "cuda",
        model_size: Optional[str] = None,
    ):
        """Initialize the parser.

        Args:
            device: It can be "cuda" or "cpu" based on the available device.
                "cuda" silently falls back to "cpu" when no CUDA device is
                available.
            model_size: There are four model sizes to choose from: "base",
                "small", "medium", and "large-v3". When None, a size is
                picked automatically based on the available GPU memory.

        Raises:
            ImportError: If the ``torch`` package is not installed.
            ValueError: If ``model_size`` is given but not one of the
                supported sizes.
        """
        try:
            import torch
        except ImportError:
            raise ImportError(
                "torch package not found, please install it with `pip install torch`"
            )

        # Determine the device to use.
        if device == "cpu":
            self.device = "cpu"
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"

        if model_size is not None:
            # The caller chose a size explicitly; reject unknown values
            # instead of silently ignoring them (the old behavior fell back
            # to the auto-selected size without any warning).
            if model_size not in self._VALID_MODEL_SIZES:
                raise ValueError(
                    f"model_size must be one of {self._VALID_MODEL_SIZES}, "
                    f"got {model_size!r}"
                )
            self.model_size = model_size
        elif self.device == "cpu":
            self.model_size = "base"
        else:
            # Pick the largest model that fits the available GPU memory (MiB).
            mem = torch.cuda.get_device_properties(self.device).total_memory / (
                1024**2
            )
            if mem < 1000:
                self.model_size = "base"
            elif mem < 3000:
                self.model_size = "small"
            elif mem < 5000:
                self.model_size = "medium"
            else:
                self.model_size = "large-v3"

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob, yielding one Document per transcribed segment.

        Args:
            blob: Audio blob; either ``blob.data`` holds the raw bytes or
                ``blob.path`` points at an audio file on disk.

        Raises:
            ImportError: If ``pydub`` or ``faster_whisper`` is not installed.
            ValueError: If the blob provides neither data nor a path.
        """
        import io

        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )

        try:
            from faster_whisper import WhisperModel
        except ImportError:
            raise ImportError(
                "faster_whisper package not found, please install it with "
                "`pip install faster-whisper`"
            )

        # get the audio
        if isinstance(blob.data, bytes):
            # blob contains the audio
            audio = AudioSegment.from_file(io.BytesIO(blob.data))
        elif blob.data is None and blob.path:
            # Audio file from disk
            audio = AudioSegment.from_file(blob.path)
        else:
            raise ValueError("Unable to get audio from blob")

        file_obj = io.BytesIO(audio.export(format="mp3").read())

        # Transcribe. float16 is only supported on CUDA devices; use int8
        # quantization on CPU so the model can actually load (passing
        # float16 unconditionally crashed the documented device="cpu" path).
        compute_type = "float16" if self.device == "cuda" else "int8"
        model = WhisperModel(
            self.model_size, device=self.device, compute_type=compute_type
        )

        segments, info = model.transcribe(file_obj, beam_size=5)

        for segment in segments:
            yield Document(
                page_content=segment.text,
                metadata={
                    "source": blob.source,
                    "timestamps": "[%.2fs -> %.2fs]" % (segment.start, segment.end),
                    "language": info.language,
                    "probability": "%d%%" % round(info.language_probability * 100),
                    **blob.metadata,
                },
            )
|
Loading…
Reference in New Issue
Block a user