mirror of https://github.com/hwchase17/langchain
feat: Add Google Speech to Text API Document Loader (#12298)
- Add Document Loader for Google Speech to Text
- Similar structure to the [Assembly AI Document Loader][1]

[1]: https://python.langchain.com/docs/integrations/document_loaders/assemblyai

pull/12092/head^2
parent
52c194ec3a
commit
134f085824
File diff suppressed because one or more lines are too long
@ -0,0 +1,136 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.utilities.vertexai import get_client_info
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from google.cloud.speech_v2 import RecognitionConfig
|
||||
from google.protobuf.field_mask_pb2 import FieldMask
|
||||
|
||||
|
||||
class GoogleSpeechToTextLoader(BaseLoader):
    """Loader for Google Cloud Speech-to-Text audio transcripts.

    It uses the Google Cloud Speech-to-Text API to transcribe an audio file
    and loads the transcribed text into Documents, one per recognition
    result returned by the API.

    To use, you should have the ``google-cloud-speech`` python package installed.

    Audio files can be specified via a Google Cloud Storage URI (``gs://...``)
    or a local file path.

    For a detailed explanation of Google Cloud Speech-to-Text, refer to the product
    documentation.
    https://cloud.google.com/speech-to-text
    """

    def __init__(
        self,
        project_id: str,
        file_path: str,
        location: str = "us-central1",
        recognizer_id: str = "_",
        config: Optional[RecognitionConfig] = None,
        config_mask: Optional[FieldMask] = None,
    ):
        """Initializes the GoogleSpeechToTextLoader.

        Args:
            project_id: Google Cloud Project ID.
            file_path: A Google Cloud Storage URI or a local file path.
            location: Speech-to-Text recognizer location.
            recognizer_id: Speech-to-Text recognizer id.
            config: Recognition options and features. When omitted, a default
                config is built: auto-detected decoding, ``en-US``, the
                ``chirp`` model and automatic punctuation enabled.
                For more information:
                https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v2.types.RecognitionConfig
            config_mask: The list of fields in config that override the values in the
                ``default_recognition_config`` of the recognizer during this
                recognition request.
                For more information:
                https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v2.types.RecognizeRequest

        Raises:
            ImportError: If the ``google-cloud-speech`` package is not installed.
        """
        # Imported lazily so the module can be imported without the optional
        # Google dependency installed.
        try:
            from google.api_core.client_options import ClientOptions
            from google.cloud.speech_v2 import (
                AutoDetectDecodingConfig,
                RecognitionConfig,
                RecognitionFeatures,
                SpeechClient,
            )
        except ImportError as exc:
            raise ImportError(
                "Could not import google-cloud-speech python package. "
                "Please install it with `pip install google-cloud-speech`."
            ) from exc

        self.project_id = project_id
        self.file_path = file_path
        self.location = location
        self.recognizer_id = recognizer_id
        # Config must be set in speech recognition request.
        self.config = config or RecognitionConfig(
            auto_decoding_config=AutoDetectDecodingConfig(),
            language_codes=["en-US"],
            model="chirp",
            features=RecognitionFeatures(
                # Automatic punctuation could be useful for language applications
                enable_automatic_punctuation=True,
            ),
        )
        self.config_mask = config_mask

        # Any location other than "global" gets its own regional endpoint;
        # "global" falls back to the client's default endpoint.
        self._client = SpeechClient(
            client_info=get_client_info(module="speech-to-text"),
            client_options=(
                ClientOptions(api_endpoint=f"{location}-speech.googleapis.com")
                if location != "global"
                else None
            ),
        )
        self._recognizer_path = self._client.recognizer_path(
            project_id, location, recognizer_id
        )

    def load(self) -> List[Document]:
        """Transcribes the audio file and loads the transcript into documents.

        It uses the Google Cloud Speech-to-Text API to transcribe the audio file
        and blocks until the transcription is finished.

        Returns:
            One Document per recognition result that has at least one
            alternative. ``page_content`` is the transcript of the first
            alternative; ``metadata`` carries the result's ``language_code``
            and ``result_end_offset``.

        Raises:
            ImportError: If the ``google-cloud-speech`` package is not installed.
        """
        try:
            from google.cloud.speech_v2 import RecognizeRequest
        except ImportError as exc:
            raise ImportError(
                "Could not import google-cloud-speech python package. "
                "Please install it with `pip install google-cloud-speech`."
            ) from exc

        request = RecognizeRequest(
            recognizer=self._recognizer_path,
            config=self.config,
            config_mask=self.config_mask,
        )

        # Only a leading "gs://" scheme marks a Cloud Storage object. A plain
        # substring test would also match local paths that merely contain it.
        if self.file_path.startswith("gs://"):
            request.uri = self.file_path
        else:
            with open(self.file_path, "rb") as f:
                request.content = f.read()

        response = self._client.recognize(request=request)

        return [
            Document(
                page_content=result.alternatives[0].transcript,
                metadata={
                    "language_code": result.language_code,
                    "result_end_offset": result.result_end_offset,
                },
            )
            for result in response.results
            # Skip results without any alternative instead of raising
            # IndexError on ``alternatives[0]``.
            if result.alternatives
        ]
|
@ -0,0 +1,34 @@
|
||||
"""Test Google Speech-to-Text document loader.
|
||||
|
||||
You need to create a Google Cloud project and enable the Speech-to-Text API to run the
|
||||
integration tests.
|
||||
Follow the instructions in the example notebook:
|
||||
google_speech_to_text.ipynb
|
||||
to set up the app and configure authentication.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders.google_speech_to_text import GoogleSpeechToTextLoader
|
||||
|
||||
|
||||
# Use the importable dotted module name, as test_load does; "google_api_core"
# is not an importable module name (presumably the marker resolves importable
# names - verify against the project's conftest).
@pytest.mark.requires("google.api_core")
def test_initialization() -> None:
    """Check that constructor arguments and defaults are stored on the loader."""
    loader = GoogleSpeechToTextLoader(
        project_id="test_project_id", file_path="./testfile.mp3"
    )
    assert loader.project_id == "test_project_id"
    assert loader.file_path == "./testfile.mp3"
    assert loader.location == "us-central1"
    assert loader.recognizer_id == "_"
|
||||
|
||||
|
||||
@pytest.mark.requires("google.api_core")
def test_load() -> None:
    """Transcribe the sample file and check the single resulting document."""
    stt_loader = GoogleSpeechToTextLoader(
        project_id="test_project_id",
        file_path="./testfile.mp3",
    )
    documents = stt_loader.load()
    assert len(documents) == 1
    first_document = documents[0]
    assert first_document.page_content == "Test transcription text"
    assert first_document.metadata["language_code"] == "en-US"
|
Loading…
Reference in New Issue