mirror of https://github.com/hwchase17/langchain
feat: Add Google Cloud Text-to-Speech Tool (#12572)
- Add Tool for [Google Cloud Text-to-Speech](https://cloud.google.com/text-to-speech) - Follows similar structure to [Eleven Labs Text2Speech](https://python.langchain.com/docs/integrations/tools/eleven_labs_tts) --------- Co-authored-by: Bagatur <baskaryan@gmail.com>pull/12537/head^2
parent
1f2c672d4a
commit
e53b9ccd70
@ -0,0 +1,94 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a991a6f8-1897-4f49-a191-ae3bdaeda856",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Google Cloud Text-to-Speech\n",
|
||||
"\n",
|
||||
"This notebook shows how to interact with the `Google Cloud Text-to-Speech API` to achieve speech synthesis capabilities."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9eeb311e-e1bd-4959-8536-4d267f302eb3",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, you need to set up an Google Cloud project. You can follow the instructions [here](https://cloud.google.com/text-to-speech/docs/before-you-begin)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "0a309c0e-5310-4eaa-8af9-bcbc252e45da",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# !pip install google-cloud-text-to-speech"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "434b2454-2bff-484d-822c-4026a9dc1383",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Usage"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2f57a647-9214-4562-a8cf-f263a15d1f40",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.tools import GoogleCloudTextToSpeechTool\n",
|
||||
"\n",
|
||||
"text_to_speak = \"Hello world!\"\n",
|
||||
"\n",
|
||||
"tts = GoogleCloudTextToSpeechTool()\n",
|
||||
"tts.name"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d4613fed-66f0-47c6-be50-7e7670654427",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We can generate audio, save it to the temporary file and then play it."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "f1984844-aa75-4f83-9d42-1c8052d87cc0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"speech_file = tts.run(text_to_speak)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -0,0 +1,5 @@
|
||||
"""Google Cloud Tools."""
|
||||
|
||||
from langchain.tools.google_cloud.texttospeech import GoogleCloudTextToSpeechTool
|
||||
|
||||
# Names exposed by this package when a caller uses `from ... import *`.
__all__ = ["GoogleCloudTextToSpeechTool"]
|
@ -0,0 +1,90 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
from langchain.callbacks.manager import CallbackManagerForToolRun
|
||||
from langchain.tools.base import BaseTool
|
||||
from langchain.utilities.vertexai import get_client_info
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from google.cloud import texttospeech
|
||||
|
||||
|
||||
def _import_google_cloud_texttospeech() -> Any:
    """Import and return the ``google.cloud.texttospeech`` module.

    The import happens at call time rather than at module load, so this file
    can be imported even when the Google Cloud client library is absent.

    Returns:
        The imported ``google.cloud.texttospeech`` module.

    Raises:
        ImportError: If ``google.cloud.texttospeech`` cannot be imported. The
            message names the PyPI package to install, and the original
            import error is chained as the cause.
    """
    try:
        from google.cloud import texttospeech
    except ImportError as e:
        # The distribution on PyPI is named "google-cloud-text-to-speech";
        # only the importable module is spelled "texttospeech".
        raise ImportError(
            "Cannot import google.cloud.texttospeech, please install "
            "`pip install google-cloud-text-to-speech`."
        ) from e
    return texttospeech
|
||||
|
||||
|
||||
def _encoding_file_extension_map(encoding: texttospeech.AudioEncoding) -> Optional[str]:
    """Look up the file suffix used for a given audio encoding.

    Args:
        encoding: A member of ``texttospeech.AudioEncoding``.

    Returns:
        The suffix (leading dot included) for ``encoding``, or ``None`` when
        the encoding is not one of the entries listed below.
    """
    audio_encoding = _import_google_cloud_texttospeech().AudioEncoding

    suffix_by_encoding = {
        audio_encoding.LINEAR16: ".wav",
        audio_encoding.MP3: ".mp3",
        audio_encoding.OGG_OPUS: ".ogg",
        audio_encoding.MULAW: ".wav",
        audio_encoding.ALAW: ".wav",
    }
    return suffix_by_encoding.get(encoding)
|
||||
|
||||
|
||||
class GoogleCloudTextToSpeechTool(BaseTool):
    """Tool that synthesizes speech through the Google Cloud Text-to-Speech API.

    A Google Cloud project has to be set up beforehand; see
    https://cloud.google.com/text-to-speech/docs/before-you-begin
    """

    name: str = "google_cloud_texttospeech"
    description: str = (
        "A wrapper around Google Cloud Text-to-Speech. "
        "Useful for when you need to synthesize audio from text. "
        "It supports multiple languages, including English, German, Polish, "
        "Spanish, Italian, French, Portuguese, and Hindi. "
    )

    # Text-to-Speech client instance; assigned in __init__.
    _client: Any

    def __init__(self, **kwargs: Any) -> None:
        """Initialize the tool and create its Text-to-Speech client.

        Args:
            **kwargs: Passed through unchanged to the base class initializer.
        """
        # Resolve the client library first so a missing package is reported
        # before the base class is initialized.
        tts_module = _import_google_cloud_texttospeech()

        super().__init__(**kwargs)

        client_info = get_client_info(module="text-to-speech")
        self._client = tts_module.TextToSpeechClient(client_info=client_info)

    def _run(
        self,
        input_text: str,
        language_code: str = "en-US",
        ssml_gender: Optional[texttospeech.SsmlVoiceGender] = None,
        audio_encoding: Optional[texttospeech.AudioEncoding] = None,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Synthesize ``input_text`` and store the audio in a temporary file.

        Args:
            input_text: Text to convert to speech.
            language_code: Language code passed to the voice selection.
            ssml_gender: Voice gender; a falsy value selects ``NEUTRAL``.
            audio_encoding: Output encoding; a falsy value selects ``MP3``.
            run_manager: Accepted for the tool interface; not used here.

        Returns:
            Path of the temporary file containing the synthesized audio. The
            file is created with ``delete=False``, so it remains on disk
            after it is closed.
        """
        tts_module = _import_google_cloud_texttospeech()
        if not ssml_gender:
            ssml_gender = tts_module.SsmlVoiceGender.NEUTRAL
        if not audio_encoding:
            audio_encoding = tts_module.AudioEncoding.MP3

        synthesis_input = tts_module.SynthesisInput(text=input_text)
        voice_params = tts_module.VoiceSelectionParams(
            language_code=language_code, ssml_gender=ssml_gender
        )
        audio_config = tts_module.AudioConfig(audio_encoding=audio_encoding)
        synthesis = self._client.synthesize_speech(
            input=synthesis_input,
            voice=voice_params,
            audio_config=audio_config,
        )

        # The suffix may be None for an unmapped encoding; the temporary
        # file is then created without an extension.
        extension = _encoding_file_extension_map(audio_encoding)

        with tempfile.NamedTemporaryFile(
            mode="bx", suffix=extension, delete=False
        ) as audio_file:
            audio_file.write(synthesis.audio_content)
        return audio_file.name
|
Loading…
Reference in New Issue