community[patch]: Skip OpenAIWhisperParser extremely small audio chunks to avoid api error (#11450)

**Description** This PR addresses a rare issue in `OpenAIWhisperParser` that causes it to crash when processing an audio file with a duration very close to the class's chunk size threshold of 20 minutes. **Issue** #11449 **Dependencies** None **Tag maintainer** @agola11 @eyurtsev **Twitter handle** leonardodiegues --------- Co-authored-by: Leonardo Diegues <leonardo.diegues@grupofolha.com.br> Co-authored-by: Bagatur <baskaryan@gmail.com>
2024-11-18 09:25:54 +00:00 · 2024-02-22 22:02:43 -03:00 · 2024-02-22 22:02:43 -03:00 · b15fccbb99
commit b15fccbb99
parent 46505742eb
1 changed files with 17 additions and 2 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/audio.py
+++ b/libs/community/langchain_community/document_loaders/parsers/audio.py
@ -13,10 +13,22 @@ logger = logging.getLogger(__name__)

 class OpenAIWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files.
-    Audio transcription is with OpenAI Whisper model."""

-    def __init__(self, api_key: Optional[str] = None):
+    Audio transcription is with OpenAI Whisper model.
+
+    Args:
+        api_key: OpenAI API key
+        chunk_duration_threshold: minimum duration of a chunk in seconds
+            NOTE: According to the OpenAI API, the chunk duration should be at least 0.1
+            seconds. If the chunk duration is less or equal than the threshold,
+            it will be skipped.
+    """
+
+    def __init__(
+        self, api_key: Optional[str] = None, *, chunk_duration_threshold: float = 0.1
+    ):
        self.api_key = api_key
+        self.chunk_duration_threshold = chunk_duration_threshold

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
@ -57,6 +69,9 @@ class OpenAIWhisperParser(BaseBlobParser):
        for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)):
            # Audio chunk
            chunk = audio[i : i + chunk_duration_ms]
+            # Skip chunks that are too short to transcribe
+            if chunk.duration_seconds <= self.chunk_duration_threshold:
+                continue
            file_obj = io.BytesIO(chunk.export(format="mp3").read())
            if blob.source is not None:
                file_obj.name = blob.source + f"_part_{split_number}.mp3"