feat: Add Google Cloud Text-to-Speech Tool (#12572)

- Add Tool for [Google Cloud Text-to-Speech](https://cloud.google.com/text-to-speech) - Follows similar structure to [Eleven Labs Text2Speech](https://python.langchain.com/docs/integrations/tools/eleven_labs_tts) --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
9 months ago · e53b9ccd70
parent 1f2c672d4a
commit e53b9ccd70
6 changed files with 205 additions and 0 deletions
--- a/docs/docs/integrations/tools/google_cloud_texttospeech.ipynb
+++ b/docs/docs/integrations/tools/google_cloud_texttospeech.ipynb
@ -0,0 +1,94 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a991a6f8-1897-4f49-a191-ae3bdaeda856",
+   "metadata": {},
+   "source": [
+    "# Google Cloud Text-to-Speech\n",
+    "\n",
+    "This notebook shows how to interact with the `Google Cloud Text-to-Speech API` to achieve speech synthesis capabilities."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9eeb311e-e1bd-4959-8536-4d267f302eb3",
+   "metadata": {},
+   "source": [
+    "First, you need to set up a Google Cloud project. You can follow the instructions [here](https://cloud.google.com/text-to-speech/docs/before-you-begin)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "0a309c0e-5310-4eaa-8af9-bcbc252e45da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !pip install google-cloud-texttospeech"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "434b2454-2bff-484d-822c-4026a9dc1383",
+   "metadata": {},
+   "source": [
+    "## Usage"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2f57a647-9214-4562-a8cf-f263a15d1f40",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.tools import GoogleCloudTextToSpeechTool\n",
+    "\n",
+    "text_to_speak = \"Hello world!\"\n",
+    "\n",
+    "tts = GoogleCloudTextToSpeechTool()\n",
+    "tts.name"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4613fed-66f0-47c6-be50-7e7670654427",
+   "metadata": {},
+   "source": [
+    "We can generate audio, save it to a temporary file and then play it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f1984844-aa75-4f83-9d42-1c8052d87cc0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "speech_file = tts.run(text_to_speak)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/libs/langchain/langchain/agents/load_tools.py
+++ b/libs/langchain/langchain/agents/load_tools.py
@ -33,6 +33,7 @@ from langchain.tools.pubmed.tool import PubmedQueryRun
 from langchain.tools.base import BaseTool
 from langchain.tools.bing_search.tool import BingSearchRun
 from langchain.tools.ddg_search.tool import DuckDuckGoSearchRun
+from langchain.tools.google_cloud.texttospeech import GoogleCloudTextToSpeechTool
 from langchain.tools.google_search.tool import GoogleSearchResults, GoogleSearchRun
 from langchain.tools.google_scholar.tool import GoogleScholarQueryRun
 from langchain.tools.metaphor_search.tool import MetaphorSearchResults
@ -326,6 +327,10 @@ def _get_eleven_labs_text2speech(**kwargs: Any) -> BaseTool:
    return ElevenLabsText2SpeechTool(**kwargs)


+def _get_google_cloud_texttospeech(**kwargs: Any) -> BaseTool:
+    """Build the Google Cloud Text-to-Speech tool, forwarding ``kwargs`` to it."""
+    tts_tool = GoogleCloudTextToSpeechTool(**kwargs)
+    return tts_tool
+
+
 _EXTRA_LLM_TOOLS: Dict[
    str,
    Tuple[Callable[[Arg(BaseLanguageModel, "llm"), KwArg(Any)], BaseTool], List[str]],
@ -390,6 +395,7 @@ _EXTRA_OPTIONAL_TOOLS: Dict[str, Tuple[Callable[[KwArg(Any)], BaseTool], List[st
        ["api_login", "api_password", "aiosession"],
    ),
    "eleven_labs_text2speech": (_get_eleven_labs_text2speech, ["eleven_api_key"]),
+    "google_cloud_texttospeech": (_get_google_cloud_texttospeech, []),
 }


--- a/libs/langchain/langchain/tools/init.py
+++ b/libs/langchain/langchain/tools/init.py
@ -240,6 +240,12 @@ def _import_gmail_GmailSendMessage() -> Any:
    return GmailSendMessage


+def _import_google_cloud_texttospeech() -> Any:
+    """Import ``GoogleCloudTextToSpeechTool`` on first use and return the class."""
+    from langchain.tools.google_cloud.texttospeech import (
+        GoogleCloudTextToSpeechTool as tool_cls,
+    )
+
+    return tool_cls
+
+
 def _import_google_places_tool() -> Any:
    from langchain.tools.google_places.tool import GooglePlacesTool

@ -731,6 +737,8 @@ def __getattr__(name: str) -> Any:
        return _import_gmail_GmailSearch()
    elif name == "GmailSendMessage":
        return _import_gmail_GmailSendMessage()
+    elif name == "GoogleCloudTextToSpeechTool":
+        return _import_google_cloud_texttospeech()
    elif name == "GooglePlacesTool":
        return _import_google_places_tool()
    elif name == "GoogleSearchResults":
@ -916,6 +924,7 @@ __all__ = [
    "GmailGetThread",
    "GmailSearch",
    "GmailSendMessage",
+    "GoogleCloudTextToSpeechTool",
    "GooglePlacesTool",
    "GoogleSearchResults",
    "GoogleSearchRun",
--- a/libs/langchain/langchain/tools/google_cloud/init.py
+++ b/libs/langchain/langchain/tools/google_cloud/init.py
@ -0,0 +1,5 @@
+"""Google Cloud Tools."""
+
+from langchain.tools.google_cloud.texttospeech import GoogleCloudTextToSpeechTool
+
+__all__ = ["GoogleCloudTextToSpeechTool"]
--- a/libs/langchain/langchain/tools/google_cloud/texttospeech.py
+++ b/libs/langchain/langchain/tools/google_cloud/texttospeech.py
@ -0,0 +1,90 @@
+from __future__ import annotations
+
+import tempfile
+from typing import TYPE_CHECKING, Any, Optional
+
+from langchain.callbacks.manager import CallbackManagerForToolRun
+from langchain.tools.base import BaseTool
+from langchain.utilities.vertexai import get_client_info
+
+if TYPE_CHECKING:
+    from google.cloud import texttospeech
+
+
+def _import_google_cloud_texttospeech() -> Any:
+    """Return the ``google.cloud.texttospeech`` module, importing it on demand.
+
+    Raises:
+        ImportError: If the module cannot be imported. The message names the
+            package to install and the original error is chained as the cause.
+    """
+    install_hint = (
+        "Cannot import google.cloud.texttospeech, please install "
+        "`pip install google-cloud-texttospeech`."
+    )
+    try:
+        from google.cloud import texttospeech as tts_module
+    except ImportError as import_error:
+        raise ImportError(install_hint) from import_error
+    else:
+        return tts_module
+
+
+def _encoding_file_extension_map(encoding: texttospeech.AudioEncoding) -> Optional[str]:
+    """Look up the file extension used for audio in the given encoding.
+
+    Args:
+        encoding: A ``texttospeech.AudioEncoding`` member.
+
+    Returns:
+        ``".wav"`` for LINEAR16, MULAW and ALAW, ``".mp3"`` for MP3, ``".ogg"``
+        for OGG_OPUS, and ``None`` for any encoding not listed here.
+    """
+    audio_encodings = _import_google_cloud_texttospeech().AudioEncoding
+
+    extension_by_encoding = {
+        audio_encodings.LINEAR16: ".wav",
+        audio_encodings.MP3: ".mp3",
+        audio_encodings.OGG_OPUS: ".ogg",
+        audio_encodings.MULAW: ".wav",
+        audio_encodings.ALAW: ".wav",
+    }
+    if encoding in extension_by_encoding:
+        return extension_by_encoding[encoding]
+    return None
+
+
+class GoogleCloudTextToSpeechTool(BaseTool):
+    """Tool that synthesizes speech with the Google Cloud Text-to-Speech API.
+
+    Setup instructions for the Google Cloud project are available at:
+    https://cloud.google.com/text-to-speech/docs/before-you-begin
+    """
+
+    name: str = "google_cloud_texttospeech"
+    description: str = (
+        "A wrapper around Google Cloud Text-to-Speech. "
+        "Useful for when you need to synthesize audio from text. "
+        "It supports multiple languages, including English, German, Polish, "
+        "Spanish, Italian, French, Portuguese, and Hindi. "
+    )
+
+    # Text-to-Speech API client; assigned in ``__init__``.
+    _client: Any
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initialize the tool and create its Text-to-Speech client.
+
+        The Google Cloud module is imported before the base initializer runs,
+        so a missing dependency surfaces as an ``ImportError`` first.
+        """
+        tts_module = _import_google_cloud_texttospeech()
+
+        super().__init__(**kwargs)
+
+        client_info = get_client_info(module="text-to-speech")
+        self._client = tts_module.TextToSpeechClient(client_info=client_info)
+
+    def _run(
+        self,
+        input_text: str,
+        language_code: str = "en-US",
+        ssml_gender: Optional[texttospeech.SsmlVoiceGender] = None,
+        audio_encoding: Optional[texttospeech.AudioEncoding] = None,
+        run_manager: Optional[CallbackManagerForToolRun] = None,
+    ) -> str:
+        """Synthesize ``input_text`` to audio and save it in a temporary file.
+
+        Args:
+            input_text: Text to synthesize.
+            language_code: Language code passed to the voice selection
+                parameters.
+            ssml_gender: Voice gender; any falsy value is replaced by
+                ``SsmlVoiceGender.NEUTRAL``.
+            audio_encoding: Output encoding; any falsy value is replaced by
+                ``AudioEncoding.MP3``.
+            run_manager: Accepted for the tool interface; not used here.
+
+        Returns:
+            Path of the temporary file holding the audio. The file is created
+            with ``delete=False``, so it is not removed when it is closed.
+        """
+        tts_module = _import_google_cloud_texttospeech()
+        if not ssml_gender:
+            ssml_gender = tts_module.SsmlVoiceGender.NEUTRAL
+        if not audio_encoding:
+            audio_encoding = tts_module.AudioEncoding.MP3
+
+        synthesis_input = tts_module.SynthesisInput(text=input_text)
+        voice_params = tts_module.VoiceSelectionParams(
+            language_code=language_code, ssml_gender=ssml_gender
+        )
+        audio_config = tts_module.AudioConfig(audio_encoding=audio_encoding)
+        response = self._client.synthesize_speech(
+            input=synthesis_input, voice=voice_params, audio_config=audio_config
+        )
+
+        # The extension may be None for an unmapped encoding; tempfile accepts that.
+        extension = _encoding_file_extension_map(audio_encoding)
+
+        with tempfile.NamedTemporaryFile(
+            mode="bx", suffix=extension, delete=False
+        ) as audio_file:
+            audio_file.write(response.audio_content)
+            audio_path = audio_file.name
+        return audio_path
--- a/libs/langchain/tests/unit_tests/tools/test_public_api.py
+++ b/libs/langchain/tests/unit_tests/tools/test_public_api.py
@ -46,6 +46,7 @@ _EXPECTED = [
    "GmailGetThread",
    "GmailSearch",
    "GmailSendMessage",
+    "GoogleCloudTextToSpeechTool",
    "GooglePlacesTool",
    "GoogleSearchResults",
    "GoogleSearchRun",