Add ElevenLabs text to speech tool (#10525)

2024-11-06 03:20:49 +00:00 · 2023-09-12 23:11:04 -07:00 · 2023-09-12 23:11:04 -07:00 · 303724980c
commit 303724980c
parent eaf916f999 79a567d885
7 changed files with 328 additions and 0 deletions
--- a/docs/extras/integrations/tools/eleven_labs_tts.ipynb
+++ b/docs/extras/integrations/tools/eleven_labs_tts.ipynb
@ -0,0 +1,226 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a991a6f8-1897-4f49-a191-ae3bdaeda856",
   "metadata": {},
   "source": [
    "# Eleven Labs Text2Speech\n",
    "\n",
    "This notebook shows how to interact with the `ElevenLabs API` to achieve text-to-speech capabilities."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9eeb311e-e1bd-4959-8536-4d267f302eb3",
   "metadata": {},
   "source": [
    "First, you need to set up an ElevenLabs account. You can follow the instructions [here](https://docs.elevenlabs.io/welcome/introduction)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0a309c0e-5310-4eaa-8af9-bcbc252e45da",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install elevenlabs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f097c3b1-f761-43cb-aad0-8ba2e93e5f5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ[\"ELEVEN_API_KEY\"] = \"\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "434b2454-2bff-484d-822c-4026a9dc1383",
   "metadata": {},
   "source": [
    "## Usage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2f57a647-9214-4562-a8cf-f263a15d1f40",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'eleven_labs_text2speech'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.tools import ElevenLabsText2SpeechTool\n",
    "\n",
    "text_to_speak = \"Hello world! I am the real slim shady\"\n",
    "\n",
    "tts = ElevenLabsText2SpeechTool()\n",
    "tts.name"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4613fed-66f0-47c6-be50-7e7670654427",
   "metadata": {},
   "source": [
    "We can generate audio, save it to a temporary file, and then play it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f1984844-aa75-4f83-9d42-1c8052d87cc0",
   "metadata": {},
   "outputs": [],
   "source": [
    "speech_file = tts.run(text_to_speak)\n",
    "tts.play(speech_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42d89cd4-ac2a-4857-9787-c9018b4a8782",
   "metadata": {},
   "source": [
    "Or stream audio directly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d72822f8-3223-47e2-8d2e-6ff46b8c8645",
   "metadata": {},
   "outputs": [],
   "source": [
    "tts.stream_speech(text_to_speak)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a152766d-5f06-48b1-ac89-b4e8d88d3c9f",
   "metadata": {},
   "source": [
    "## Use within an Agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "37626aea-0cf0-4849-9c00-c0f40515ffe0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain import OpenAI\n",
    "from langchain.agents import initialize_agent, AgentType, load_tools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "c168f28e-d5b7-4c93-bed8-0ab317b4a44b",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = OpenAI(temperature=0)\n",
    "tools = load_tools([\"eleven_labs_text2speech\"])\n",
    "agent = initialize_agent(\n",
    "    tools=tools,\n",
    "    llm=llm,\n",
    "    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,\n",
    "    verbose=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "336bf95a-3ccb-4963-aac3-638a4df2ed78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
      "\u001b[32;1m\u001b[1;3mAction:\n",
      "```\n",
      "{\n",
      "  \"action\": \"eleven_labs_text2speech\",\n",
      "  \"action_input\": {\n",
      "    \"query\": \"Why did the chicken cross the playground? To get to the other slide!\"\n",
      "  }\n",
      "}\n",
      "```\n",
      "\n",
      "\u001b[0m\n",
      "Observation: \u001b[36;1m\u001b[1;3m/tmp/tmpsfg783f1.wav\u001b[0m\n",
      "Thought:\u001b[32;1m\u001b[1;3m I have the audio file ready to be sent to the human\n",
      "Action:\n",
      "```\n",
      "{\n",
      "  \"action\": \"Final Answer\",\n",
      "  \"action_input\": \"/tmp/tmpsfg783f1.wav\"\n",
      "}\n",
      "```\n",
      "\n",
      "\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished chain.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "audio_file = agent.run(\"Tell me a joke and read it out for me.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f0aa7aa9-4682-4599-8cae-59347d9e5210",
   "metadata": {},
   "outputs": [],
   "source": [
    "tts.play(audio_file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/libs/langchain/langchain/agents/load_tools.py
+++ b/libs/langchain/langchain/agents/load_tools.py
@ -32,6 +32,7 @@ from langchain.tools.requests.tool import (
    RequestsPostTool,
    RequestsPutTool,
 )
 from langchain.tools.eleven_labs.text2speech import ElevenLabsText2SpeechTool
 from langchain.tools.scenexplain.tool import SceneXplainTool
 from langchain.tools.searx_search.tool import SearxSearchResults, SearxSearchRun
 from langchain.tools.shell.tool import ShellTool
@ -285,6 +286,10 @@ def _get_dataforseo_api_search_json(**kwargs: Any) -> BaseTool:
    return DataForSeoAPISearchResults(api_wrapper=DataForSeoAPIWrapper(**kwargs))
 def _get_eleven_labs_text2speech(**kwargs: Any) -> BaseTool:
     """Create the Eleven Labs text-to-speech tool.

     Every keyword argument is forwarded unchanged to the
     ``ElevenLabsText2SpeechTool`` constructor.
     """
     tts_tool = ElevenLabsText2SpeechTool(**kwargs)
     return tts_tool
 _EXTRA_LLM_TOOLS: Dict[
    str,
    Tuple[Callable[[Arg(BaseLanguageModel, "llm"), KwArg(Any)], BaseTool], List[str]],
@ -340,6 +345,7 @@ _EXTRA_OPTIONAL_TOOLS: Dict[str, Tuple[Callable[[KwArg(Any)], BaseTool], List[st
        _get_dataforseo_api_search_json,
        ["api_login", "api_password", "aiosession"],
    ),
    "eleven_labs_text2speech": (_get_eleven_labs_text2speech, ["eleven_api_key"]),
 }
--- a/libs/langchain/langchain/tools/init.py
+++ b/libs/langchain/langchain/tools/init.py
@ -44,6 +44,7 @@ from langchain.tools.edenai import (
    EdenAiTextToSpeechTool,
    EdenaiTool,
 )
 from langchain.tools.eleven_labs.text2speech import ElevenLabsText2SpeechTool
 from langchain.tools.file_management import (
    CopyFileTool,
    DeleteFileTool,
@ -167,6 +168,7 @@ __all__ = [
    "EdenAiSpeechToTextTool",
    "EdenAiTextModerationTool",
    "EdenaiTool",
    "ElevenLabsText2SpeechTool",
    "ExtractHyperlinksTool",
    "ExtractTextTool",
    "FileSearchTool",
--- a/libs/langchain/langchain/tools/eleven_labs/init.py
+++ b/libs/langchain/langchain/tools/eleven_labs/init.py
@ -0,0 +1,5 @@
 """Eleven Labs Services Tools."""
 from langchain.tools.eleven_labs.text2speech import ElevenLabsText2SpeechTool
 __all__ = ["ElevenLabsText2SpeechTool"]
--- a/libs/langchain/langchain/tools/eleven_labs/models.py
+++ b/libs/langchain/langchain/tools/eleven_labs/models.py
@ -0,0 +1,8 @@
 from enum import Enum
 class ElevenLabsModel(str, Enum):
    """Models available for Eleven Labs Text2Speech.

    Each member's value is a model identifier string. Because the enum also
    subclasses ``str``, a member compares equal to its identifier string.
    """
    # Identifier of the multilingual model.
    MULTI_LINGUAL = "eleven_multilingual_v1"
    # Identifier of the monolingual model.
    MONO_LINGUAL = "eleven_monolingual_v1"
--- a/libs/langchain/langchain/tools/eleven_labs/text2speech.py
+++ b/libs/langchain/langchain/tools/eleven_labs/text2speech.py
@ -0,0 +1,80 @@
 import tempfile
 from enum import Enum
 from typing import Any, Dict, Optional, Union
 from langchain.callbacks.manager import CallbackManagerForToolRun
 from langchain.pydantic_v1 import root_validator
 from langchain.tools.base import BaseTool
 from langchain.utils import get_from_dict_or_env
 def _import_elevenlabs() -> Any:
    try:
        import elevenlabs
    except ImportError as e:
        raise ImportError(
            "Cannot import elevenlabs, please install `pip install elevenlabs`."
        ) from e
    return elevenlabs
 class ElevenLabsModel(str, Enum):
    """Models available for Eleven Labs Text2Speech.

    Each member's value is a model identifier string. Because the enum also
    subclasses ``str``, a member compares equal to its identifier string.
    """
    # NOTE(review): an identical enum is defined in
    # langchain/tools/eleven_labs/models.py; consider keeping one definition
    # and importing it here so the two cannot drift apart.
    # Identifier of the multilingual model.
    MULTI_LINGUAL = "eleven_multilingual_v1"
    # Identifier of the monolingual model.
    MONO_LINGUAL = "eleven_monolingual_v1"
 class ElevenLabsText2SpeechTool(BaseTool):
     """Tool that queries the Eleven Labs Text2Speech API.

     In order to set this up, follow instructions at:
     https://docs.elevenlabs.io/welcome/introduction

     An API key is looked up under ``eleven_api_key`` in the constructor
     values or in the ``ELEVEN_API_KEY`` environment variable when the tool
     is created.
     """

     # Model identifier handed to ``elevenlabs.generate``.
     model: Union[ElevenLabsModel, str] = ElevenLabsModel.MULTI_LINGUAL

     name: str = "eleven_labs_text2speech"
     description: str = (
         "A wrapper around Eleven Labs Text2Speech. "
         "Useful for when you need to convert text to speech. "
         "It supports multiple languages, including English, German, Polish, "
         "Spanish, Italian, French, Portuguese, and Hindi. "
     )

     @root_validator(pre=True)
     def validate_environment(cls, values: Dict) -> Dict:
         """Validate that api key exists in environment.

         Args:
             values: Raw constructor values, before field validation.

         Returns:
             ``values`` unchanged.
         """
         # NOTE(review): the resolved key is only looked up, then discarded;
         # it is neither stored on the tool nor passed to
         # ``elevenlabs.generate``. Presumably the client reads
         # ``ELEVEN_API_KEY`` itself and ``get_from_dict_or_env`` raises when
         # the key is missing -- confirm both.
         _ = get_from_dict_or_env(values, "eleven_api_key", "ELEVEN_API_KEY")
         return values

     def _run(
         self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
     ) -> str:
         """Use the tool.

         Args:
             query: Text to convert to speech.
             run_manager: Optional callback manager; not used by this method.

         Returns:
             Path of the temporary file the generated audio was written to.

         Raises:
             RuntimeError: If generating or saving the audio fails. The
                 original exception is attached as ``__cause__``.
         """
         elevenlabs = _import_elevenlabs()
         try:
             speech = elevenlabs.generate(text=query, model=self.model)
             # delete=False keeps the file on disk after the handle is closed,
             # so the returned path stays usable by the caller.
             with tempfile.NamedTemporaryFile(
                 mode="bx", suffix=".wav", delete=False
             ) as f:
                 f.write(speech)
             return f.name
         except Exception as e:
             # Chain explicitly, matching the ``from e`` style used by
             # ``_import_elevenlabs``.
             raise RuntimeError(
                 f"Error while running ElevenLabsText2SpeechTool: {e}"
             ) from e

     def play(self, speech_file: str) -> None:
         """Play the text as speech.

         Args:
             speech_file: Path of an audio file, e.g. the value returned by
                 running the tool.
         """
         elevenlabs = _import_elevenlabs()
         with open(speech_file, mode="rb") as f:
             speech = f.read()
         elevenlabs.play(speech)

     def stream_speech(self, query: str) -> None:
         """Stream the text as speech as it is generated.

         Play the text in your speakers.

         Args:
             query: Text to convert to speech.
         """
         elevenlabs = _import_elevenlabs()
         speech_stream = elevenlabs.generate(text=query, model=self.model, stream=True)
         elevenlabs.stream(speech_stream)
--- a/libs/langchain/tests/unit_tests/tools/test_public_api.py
+++ b/libs/langchain/tests/unit_tests/tools/test_public_api.py
@ -36,6 +36,7 @@ _EXPECTED = [
    "EdenAiTextModerationTool",
    "EdenAiTextToSpeechTool",
    "EdenaiTool",
    "ElevenLabsText2SpeechTool",
    "ExtractHyperlinksTool",
    "ExtractTextTool",
    "FileSearchTool",