diff --git a/docs/extras/integrations/tools/eleven_labs_tts.ipynb b/docs/extras/integrations/tools/eleven_labs_tts.ipynb new file mode 100644 index 0000000000..093679c8d1 --- /dev/null +++ b/docs/extras/integrations/tools/eleven_labs_tts.ipynb @@ -0,0 +1,226 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a991a6f8-1897-4f49-a191-ae3bdaeda856", + "metadata": {}, + "source": [ + "# Eleven Labs Text2Speech\n", + "\n", + "This notebook shows how to interact with the `ElevenLabs API` to achieve text-to-speech capabilities." + ] + }, + { + "cell_type": "markdown", + "id": "9eeb311e-e1bd-4959-8536-4d267f302eb3", + "metadata": {}, + "source": [ + "First, you need to set up an ElevenLabs account. You can follow the instructions [here](https://docs.elevenlabs.io/welcome/introduction)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0a309c0e-5310-4eaa-8af9-bcbc252e45da", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install elevenlabs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f097c3b1-f761-43cb-aad0-8ba2e93e5f5f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"ELEVEN_API_KEY\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "434b2454-2bff-484d-822c-4026a9dc1383", + "metadata": {}, + "source": [ + "## Usage" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2f57a647-9214-4562-a8cf-f263a15d1f40", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'eleven_labs_text2speech'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.tools import ElevenLabsText2SpeechTool\n", + "\n", + "text_to_speak = \"Hello world! 
I am the real slim shady\"\n", + "\n", + "tts = ElevenLabsText2SpeechTool()\n", + "tts.name" + ] + }, + { + "cell_type": "markdown", + "id": "d4613fed-66f0-47c6-be50-7e7670654427", + "metadata": {}, + "source": [ + "We can generate audio, save it to the temporary file and then play it." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f1984844-aa75-4f83-9d42-1c8052d87cc0", + "metadata": {}, + "outputs": [], + "source": [ + "speech_file = tts.run(text_to_speak)\n", + "tts.play(speech_file)" + ] + }, + { + "cell_type": "markdown", + "id": "42d89cd4-ac2a-4857-9787-c9018b4a8782", + "metadata": {}, + "source": [ + "Or stream audio directly." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d72822f8-3223-47e2-8d2e-6ff46b8c8645", + "metadata": {}, + "outputs": [], + "source": [ + "tts.stream_speech(text_to_speak)" + ] + }, + { + "cell_type": "markdown", + "id": "a152766d-5f06-48b1-ac89-b4e8d88d3c9f", + "metadata": {}, + "source": [ + "## Use within an Agent" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "37626aea-0cf0-4849-9c00-c0f40515ffe0", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain import OpenAI\n", + "from langchain.agents import initialize_agent, AgentType, load_tools" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c168f28e-d5b7-4c93-bed8-0ab317b4a44b", + "metadata": {}, + "outputs": [], + "source": [ + "llm = OpenAI(temperature=0)\n", + "tools = load_tools([\"eleven_labs_text2speech\"])\n", + "agent = initialize_agent(\n", + " tools=tools,\n", + " llm=llm,\n", + " agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "336bf95a-3ccb-4963-aac3-638a4df2ed78", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + 
"\u001b[32;1m\u001b[1;3mAction:\n", + "```\n", + "{\n", + " \"action\": \"eleven_labs_text2speech\",\n", + " \"action_input\": {\n", + " \"query\": \"Why did the chicken cross the playground? To get to the other slide!\"\n", + " }\n", + "}\n", + "```\n", + "\n", + "\u001b[0m\n", + "Observation: \u001b[36;1m\u001b[1;3m/tmp/tmpsfg783f1.wav\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3m I have the audio file ready to be sent to the human\n", + "Action:\n", + "```\n", + "{\n", + " \"action\": \"Final Answer\",\n", + " \"action_input\": \"/tmp/tmpsfg783f1.wav\"\n", + "}\n", + "```\n", + "\n", + "\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + } + ], + "source": [ + "audio_file = agent.run(\"Tell me a joke and read it out for me.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f0aa7aa9-4682-4599-8cae-59347d9e5210", + "metadata": {}, + "outputs": [], + "source": [ + "tts.play(audio_file)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/agents/load_tools.py b/libs/langchain/langchain/agents/load_tools.py index 8fc93e45ae..28a3b48350 100644 --- a/libs/langchain/langchain/agents/load_tools.py +++ b/libs/langchain/langchain/agents/load_tools.py @@ -32,6 +32,7 @@ from langchain.tools.requests.tool import ( RequestsPostTool, RequestsPutTool, ) +from langchain.tools.eleven_labs.text2speech import ElevenLabsText2SpeechTool from langchain.tools.scenexplain.tool import SceneXplainTool from langchain.tools.searx_search.tool import SearxSearchResults, SearxSearchRun from langchain.tools.shell.tool import ShellTool 
@@ -285,6 +286,10 @@ def _get_dataforseo_api_search_json(**kwargs: Any) -> BaseTool: return DataForSeoAPISearchResults(api_wrapper=DataForSeoAPIWrapper(**kwargs)) +def _get_eleven_labs_text2speech(**kwargs: Any) -> BaseTool: + return ElevenLabsText2SpeechTool(**kwargs) + + _EXTRA_LLM_TOOLS: Dict[ str, Tuple[Callable[[Arg(BaseLanguageModel, "llm"), KwArg(Any)], BaseTool], List[str]], @@ -340,6 +345,7 @@ _EXTRA_OPTIONAL_TOOLS: Dict[str, Tuple[Callable[[KwArg(Any)], BaseTool], List[st _get_dataforseo_api_search_json, ["api_login", "api_password", "aiosession"], ), + "eleven_labs_text2speech": (_get_eleven_labs_text2speech, ["eleven_api_key"]), } diff --git a/libs/langchain/langchain/tools/__init__.py b/libs/langchain/langchain/tools/__init__.py index 56958d90b2..5674929f3d 100644 --- a/libs/langchain/langchain/tools/__init__.py +++ b/libs/langchain/langchain/tools/__init__.py @@ -44,6 +44,7 @@ from langchain.tools.edenai import ( EdenAiTextToSpeechTool, EdenaiTool, ) +from langchain.tools.eleven_labs.text2speech import ElevenLabsText2SpeechTool from langchain.tools.file_management import ( CopyFileTool, DeleteFileTool, @@ -167,6 +168,7 @@ __all__ = [ "EdenAiSpeechToTextTool", "EdenAiTextModerationTool", "EdenaiTool", + "ElevenLabsText2SpeechTool", "ExtractHyperlinksTool", "ExtractTextTool", "FileSearchTool", diff --git a/libs/langchain/langchain/tools/eleven_labs/__init__.py b/libs/langchain/langchain/tools/eleven_labs/__init__.py new file mode 100644 index 0000000000..86ccba0804 --- /dev/null +++ b/libs/langchain/langchain/tools/eleven_labs/__init__.py @@ -0,0 +1,5 @@ +"""Eleven Labs Services Tools.""" + +from langchain.tools.eleven_labs.text2speech import ElevenLabsText2SpeechTool + +__all__ = ["ElevenLabsText2SpeechTool"] diff --git a/libs/langchain/langchain/tools/eleven_labs/models.py b/libs/langchain/langchain/tools/eleven_labs/models.py new file mode 100644 index 0000000000..c977b2972f --- /dev/null +++ b/libs/langchain/langchain/tools/eleven_labs/models.py @@ 
-0,0 +1,8 @@
+from enum import Enum
+
+
+class ElevenLabsModel(str, Enum):
+    """Models available for Eleven Labs Text2Speech."""
+
+    MULTI_LINGUAL = "eleven_multilingual_v1"
+    MONO_LINGUAL = "eleven_monolingual_v1"
diff --git a/libs/langchain/langchain/tools/eleven_labs/text2speech.py b/libs/langchain/langchain/tools/eleven_labs/text2speech.py
new file mode 100644
index 0000000000..216fb8143d
--- /dev/null
+++ b/libs/langchain/langchain/tools/eleven_labs/text2speech.py
@@ -0,0 +1,77 @@
+import tempfile
+from typing import Any, Dict, Optional, Union
+
+from langchain.callbacks.manager import CallbackManagerForToolRun
+from langchain.pydantic_v1 import root_validator
+from langchain.tools.base import BaseTool
+from langchain.tools.eleven_labs.models import ElevenLabsModel
+from langchain.utils import get_from_dict_or_env
+
+
+def _import_elevenlabs() -> Any:
+    """Import the elevenlabs package lazily so it stays an optional dependency."""
+    try:
+        import elevenlabs
+    except ImportError as e:
+        raise ImportError(
+            "elevenlabs is not installed. " "Run `pip install elevenlabs` to install."
+        ) from e
+    return elevenlabs
+
+
+class ElevenLabsText2SpeechTool(BaseTool):
+    """Tool that queries the Eleven Labs Text2Speech API.
+
+    In order to set this up, follow instructions at:
+    https://docs.elevenlabs.io/welcome/introduction
+    """
+
+    model: Union[ElevenLabsModel, str] = ElevenLabsModel.MULTI_LINGUAL
+
+    name: str = "eleven_labs_text2speech"
+    description: str = (
+        "A wrapper around Eleven Labs Text2Speech. "
+        "Useful for when you need to convert text to speech. "
+        "It supports multiple languages, including English, German, Polish, "
+        "Spanish, Italian, French, Portuguese, and Hindi. "
+    )
+
+    @root_validator(pre=True)
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Validate that api key exists in environment."""
+        _ = get_from_dict_or_env(values, "eleven_api_key", "ELEVEN_API_KEY")
+
+        return values
+
+    def _text2speech(self, text: str) -> str:
+        """Generate speech for *text* and write it to a temporary .wav file."""
+        elevenlabs = _import_elevenlabs()
+        speech = elevenlabs.generate(text=text, model=self.model)
+        with tempfile.NamedTemporaryFile(mode="bx", suffix=".wav", delete=False) as f:
+            f.write(speech)
+        return f.name
+
+    def _run(
+        self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None
+    ) -> str:
+        """Use the tool."""
+        try:
+            speech_file = self._text2speech(query)
+            return speech_file
+        except Exception as e:
+            raise RuntimeError(f"Error while running ElevenLabsText2SpeechTool: {e}")
+
+    def play(self, speech_file: str) -> None:
+        """Play the text as speech."""
+        elevenlabs = _import_elevenlabs()
+        with open(speech_file, mode="rb") as f:
+            speech = f.read()
+
+        elevenlabs.play(speech)
+
+    def stream_speech(self, query: str) -> None:
+        """Stream the text as speech as it is generated.
+        Play the text in your speakers."""
+        elevenlabs = _import_elevenlabs()
+        speech_stream = elevenlabs.generate(text=query, model=self.model, stream=True)
+        elevenlabs.stream(speech_stream)
diff --git a/libs/langchain/tests/unit_tests/tools/test_public_api.py b/libs/langchain/tests/unit_tests/tools/test_public_api.py
index e7fd784587..d0c310837d 100644
--- a/libs/langchain/tests/unit_tests/tools/test_public_api.py
+++ b/libs/langchain/tests/unit_tests/tools/test_public_api.py
@@ -36,6 +36,7 @@ _EXPECTED = [
     "EdenAiTextModerationTool",
     "EdenAiTextToSpeechTool",
     "EdenaiTool",
+    "ElevenLabsText2SpeechTool",
     "ExtractHyperlinksTool",
     "ExtractTextTool",
     "FileSearchTool",