community[patch]: Skip extremely small audio chunks in `OpenAIWhisperParser` to avoid API error (#11450)

**Description**
This PR fixes a rare crash in `OpenAIWhisperParser` that occurs when an audio file's
duration is just barely over the class's 20-minute chunk size: the leftover final chunk
is then too short for the OpenAI Whisper API to accept, and the request errors out.
Chunks at or below a configurable minimum duration are now skipped.
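
As a usage sketch (not part of the patch itself), the new keyword-only
`chunk_duration_threshold` argument can be set at construction time. The import paths
and the `Blob.from_path` helper below assume the usual `langchain_community` layout,
and `meeting.mp3` is a placeholder file:

```python
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser

# Chunks whose duration is at or below the threshold (default 0.1 s) are skipped
# instead of being sent to the Whisper API, which rejects audio shorter than 0.1 s.
parser = OpenAIWhisperParser(api_key="sk-...", chunk_duration_threshold=0.1)

# "meeting.mp3" is a placeholder path for any local audio file.
docs = list(parser.lazy_parse(Blob.from_path("meeting.mp3")))
```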

**Issue**
#11449

**Dependencies**
None

**Tag maintainer**
@agola11 @eyurtsev 

**Twitter handle**
leonardodiegues

---------

Co-authored-by: Leonardo Diegues <leonardo.diegues@grupofolha.com.br>
Co-authored-by: Bagatur <baskaryan@gmail.com>

```diff
@@ -13,10 +13,22 @@ logger = logging.getLogger(__name__)
 
 class OpenAIWhisperParser(BaseBlobParser):
     """Transcribe and parse audio files.
-    Audio transcription is with OpenAI Whisper model."""
 
-    def __init__(self, api_key: Optional[str] = None):
+    Audio transcription is with OpenAI Whisper model.
+
+    Args:
+        api_key: OpenAI API key
+        chunk_duration_threshold: minimum duration of a chunk in seconds
+            NOTE: According to the OpenAI API, the chunk duration should be at least 0.1
+            seconds. If the chunk duration is less or equal than the threshold,
+            it will be skipped.
+    """
+
+    def __init__(
+        self, api_key: Optional[str] = None, *, chunk_duration_threshold: float = 0.1
+    ):
         self.api_key = api_key
+        self.chunk_duration_threshold = chunk_duration_threshold
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
         """Lazily parse the blob."""
@@ -57,6 +69,9 @@ class OpenAIWhisperParser(BaseBlobParser):
         for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)):
             # Audio chunk
             chunk = audio[i : i + chunk_duration_ms]
+            # Skip chunks that are too short to transcribe
+            if chunk.duration_seconds <= self.chunk_duration_threshold:
+                continue
             file_obj = io.BytesIO(chunk.export(format="mp3").read())
             if blob.source is not None:
                 file_obj.name = blob.source + f"_part_{split_number}.mp3"
```
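
To make the failure mode concrete, here is a small self-contained sketch (plain Python,
not from the patch) of the chunking arithmetic: a file barely longer than the 20-minute
chunk size leaves a final chunk well under the 0.1-second minimum, which is exactly what
the new threshold check filters out. The durations are hypothetical and chosen only for
illustration.

```python
chunk_duration_s = 20 * 60          # parser's fixed chunk size
audio_duration_s = 20 * 60 + 0.05   # file just barely longer than one chunk
threshold_s = 0.1                   # default chunk_duration_threshold

# Split the duration the same way the parser splits the audio: fixed-size windows.
chunk_lengths = []
start = 0.0
while start < audio_duration_s:
    chunk_lengths.append(min(chunk_duration_s, audio_duration_s - start))
    start += chunk_duration_s

kept = [c for c in chunk_lengths if c > threshold_s]
print(chunk_lengths)  # [1200.0, ~0.05] -> the tiny tail used to make the API call fail
print(kept)           # [1200.0]        -> with the fix, the tail is skipped
```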
