import logging
import time
from typing import Dict, Iterator, Optional, Tuple

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.utils.openai import is_openai_v1

logger = logging.getLogger(__name__)

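# These parsers are typically driven through a blob loader rather than called
# directly. A minimal sketch (the directory path and glob are hypothetical;
# assumes GenericLoader.from_filesystem accepts a `parser` argument, as in
# recent langchain_community releases):
#
#     from langchain_community.document_loaders.generic import GenericLoader
#
#     loader = GenericLoader.from_filesystem(
#         "./recordings", glob="*.mp3", parser=OpenAIWhisperParser()
#     )
#     docs = loader.load()
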
class OpenAIWhisperParser(BaseBlobParser):
    """Transcribe and parse audio files.

    Audio transcription is done with the OpenAI Whisper model via the API."""

    def __init__(self, api_key: Optional[str] = None):
        self.api_key = api_key

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        import io

        try:
            import openai
        except ImportError:
            raise ImportError(
                "openai package not found, please install it with "
                "`pip install openai`"
            )
        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )

        if is_openai_v1():
            # api_key optional, defaults to `os.environ['OPENAI_API_KEY']`
            client = openai.OpenAI(api_key=self.api_key)
        else:
            # Set the API key if provided
            if self.api_key:
                openai.api_key = self.api_key

        # Audio file from disk
        audio = AudioSegment.from_file(blob.path)

        # Define the duration of each chunk in minutes
        # Need to meet the 25MB size limit of the Whisper API
        chunk_duration = 20
        chunk_duration_ms = chunk_duration * 60 * 1000

        # Split the audio into chunk_duration_ms chunks
        for split_number, i in enumerate(range(0, len(audio), chunk_duration_ms)):
            # Audio chunk
            chunk = audio[i : i + chunk_duration_ms]
            file_obj = io.BytesIO(chunk.export(format="mp3").read())
            if blob.source is not None:
                file_obj.name = blob.source + f"_part_{split_number}.mp3"
            else:
                file_obj.name = f"part_{split_number}.mp3"

            # Transcribe, retrying up to 3 times on transient API errors
            print(f"Transcribing part {split_number + 1}!")
            attempts = 0
            while attempts < 3:
                try:
                    if is_openai_v1():
                        transcript = client.audio.transcriptions.create(
                            model="whisper-1", file=file_obj
                        )
                    else:
                        transcript = openai.Audio.transcribe("whisper-1", file_obj)
                    break
                except Exception as e:
                    attempts += 1
                    print(f"Attempt {attempts} failed. Exception: {str(e)}")
                    time.sleep(5)
            else:
                # while-else: runs only when all attempts were exhausted
                # without reaching `break`
                print("Failed to transcribe after 3 attempts.")
                continue

            yield Document(
                page_content=transcript.text,
                metadata={"source": blob.source, "chunk": split_number},
            )

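# A minimal usage sketch for OpenAIWhisperParser (illustrative only; the file
# path is hypothetical and `OPENAI_API_KEY` is assumed to be set in the
# environment):
#
#     parser = OpenAIWhisperParser()
#     blob = Blob.from_path("meeting_recording.mp3")
#     for doc in parser.lazy_parse(blob):
#         print(doc.metadata["chunk"], doc.page_content[:80])
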
class OpenAIWhisperParserLocal(BaseBlobParser):
    """Transcribe and parse audio files with the OpenAI Whisper model.

    Audio transcription runs locally with the OpenAI Whisper model
    loaded from `transformers`.

    Parameters:
    device - device to use.
        NOTE: By default uses the GPU if available;
        if you want to use the CPU, set device = "cpu".
    lang_model - whisper model to use, for example "openai/whisper-medium".
    forced_decoder_ids - forced decoder ids for a multilingual model,
        usage example:
        from transformers import WhisperProcessor
        processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
        forced_decoder_ids = processor.get_decoder_prompt_ids(
            language="french", task="transcribe"
        )
        forced_decoder_ids = processor.get_decoder_prompt_ids(
            language="french", task="translate"
        )
    """

    def __init__(
        self,
        device: str = "0",
        lang_model: Optional[str] = None,
        forced_decoder_ids: Optional[Tuple[Dict]] = None,
    ):
        """Initialize the parser.

        Args:
            device: device to use.
            lang_model: whisper model to use, for example "openai/whisper-medium".
                Defaults to None.
            forced_decoder_ids: forced decoder ids for a multilingual model.
                Defaults to None.
        """
        try:
            from transformers import pipeline
        except ImportError:
            raise ImportError(
                "transformers package not found, please install it with "
                "`pip install transformers`"
            )
        try:
            import torch
        except ImportError:
            raise ImportError(
                "torch package not found, please install it with `pip install torch`"
            )

        # Set the device: honor an explicit "cpu", otherwise use the GPU
        # when one is available and fall back to the CPU when it is not.
        if device == "cpu":
            self.device = "cpu"
            if lang_model is not None:
                self.lang_model = lang_model
                print("WARNING! Model override. Using model: ", self.lang_model)
            else:
                # unless overridden, use the small base model on cpu
                self.lang_model = "openai/whisper-base"
        else:
            if torch.cuda.is_available():
                self.device = "cuda:0"
                # check GPU memory and select the model automatically
                mem = torch.cuda.get_device_properties(self.device).total_memory / (
                    1024**2
                )
                if mem < 5000:
                    rec_model = "openai/whisper-base"
                elif mem < 7000:
                    rec_model = "openai/whisper-small"
                elif mem < 12000:
                    rec_model = "openai/whisper-medium"
                else:
                    rec_model = "openai/whisper-large"

                # check if model is overridden
                if lang_model is not None:
                    self.lang_model = lang_model
                    print("WARNING! Model override. Might not fit in your GPU")
                else:
                    self.lang_model = rec_model
            else:
                # no GPU available: fall back to the CPU and the base model
                self.device = "cpu"
                self.lang_model = (
                    lang_model if lang_model is not None else "openai/whisper-base"
                )

        print("Using the following model: ", self.lang_model)

        # load model for inference
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=self.lang_model,
            chunk_length_s=30,
            device=self.device,
        )
        if forced_decoder_ids is not None:
            try:
                self.pipe.model.config.forced_decoder_ids = forced_decoder_ids
            except Exception as exception_text:
                logger.info(
                    "Unable to set forced_decoder_ids parameter for whisper model: "
                    f"{exception_text}. "
                    "The whisper model will use its default decoder settings."
                )

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        import io

        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )
        try:
            import librosa
        except ImportError:
            raise ImportError(
                "librosa package not found, please install it with "
                "`pip install librosa`"
            )

        # Audio file from disk
        audio = AudioSegment.from_file(blob.path)

        file_obj = io.BytesIO(audio.export(format="mp3").read())

        # Transcribe
        print(f"Transcribing part {blob.path}!")

        # Whisper models expect 16 kHz mono audio
        y, sr = librosa.load(file_obj, sr=16000)

        prediction = self.pipe(y.copy(), batch_size=8)["text"]

        yield Document(
            page_content=prediction,
            metadata={"source": blob.source},
        )

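# A minimal usage sketch for OpenAIWhisperParserLocal (illustrative only; the
# file path is hypothetical, and the transformers model is downloaded on
# first use):
#
#     parser = OpenAIWhisperParserLocal(device="cpu", lang_model="openai/whisper-base")
#     blob = Blob.from_path("interview.wav")
#     docs = list(parser.lazy_parse(blob))
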
class YandexSTTParser(BaseBlobParser):
    """Transcribe and parse audio files.

    Audio transcription is done with the Yandex SpeechKit API."""

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        iam_token: Optional[str] = None,
        model: str = "general",
        language: str = "auto",
    ):
        """Initialize the parser.

        Args:
            api_key: API key for a service account
                with the `ai.speechkit-stt.user` role.
            iam_token: IAM token for a service account
                with the `ai.speechkit-stt.user` role.
            model: Recognition model name.
                Defaults to general.
            language: The language in ISO 639-1 format.
                Defaults to automatic language recognition.
        Either `api_key` or `iam_token` must be provided, but not both.
        """
        if (api_key is None) == (iam_token is None):
            raise ValueError(
                "Either 'api_key' or 'iam_token' must be provided, but not both."
            )
        self.api_key = api_key
        self.iam_token = iam_token
        self.model = model
        self.language = language

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""

        try:
            from speechkit import configure_credentials, creds, model_repository
            from speechkit.stt import AudioProcessingType
        except ImportError:
            raise ImportError(
                "yandex-speechkit package not found, please install it with "
                "`pip install yandex-speechkit`"
            )
        try:
            from pydub import AudioSegment
        except ImportError:
            raise ImportError(
                "pydub package not found, please install it with `pip install pydub`"
            )

        if self.api_key:
            configure_credentials(
                yandex_credentials=creds.YandexCredentials(api_key=self.api_key)
            )
        else:
            configure_credentials(
                yandex_credentials=creds.YandexCredentials(iam_token=self.iam_token)
            )

        audio = AudioSegment.from_file(blob.path)

        model = model_repository.recognition_model()

        model.model = self.model
        model.language = self.language
        model.audio_processing_type = AudioProcessingType.Full

        result = model.transcribe(audio)

        for res in result:
            yield Document(
                page_content=res.normalized_text,
                metadata={"source": blob.source},
            )

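# A minimal usage sketch for YandexSTTParser (illustrative only; the API key
# and file path are hypothetical):
#
#     parser = YandexSTTParser(api_key="<your-speechkit-api-key>", language="ru")
#     blob = Blob.from_path("podcast_episode.mp3")
#     for doc in parser.lazy_parse(blob):
#         print(doc.page_content)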