From f7e8a382ccb25441c5eea624e62a18533c3524ee Mon Sep 17 00:00:00 2001
From: Robby <45851384+h0rv@users.noreply.github.com>
Date: Fri, 29 Mar 2024 11:02:29 -0400
Subject: [PATCH] community[minor]: add hugging face text-to-speech inference API (#18880)

Description: I implemented a tool to use Hugging Face text-to-speech
inference API.

Issue: n/a

Dependencies: n/a

Twitter handle: No Twitter, but do have
[LinkedIn](https://www.linkedin.com/in/robby-horvath/) lol.

---------

Co-authored-by: Robby
Co-authored-by: Eugene Yurtsev
---
 .../tools/audio/__init__.py                   |   7 +
 .../huggingface_text_to_speech_inference.py   | 153 ++++++++++++++++++
 .../tests/unit_tests/tools/audio/__init__.py  |   0
 .../unit_tests/tools/audio/test_tools.py      | 119 ++++++++++++++
 4 files changed, 279 insertions(+)
 create mode 100644 libs/community/langchain_community/tools/audio/__init__.py
 create mode 100644 libs/community/langchain_community/tools/audio/huggingface_text_to_speech_inference.py
 create mode 100644 libs/community/tests/unit_tests/tools/audio/__init__.py
 create mode 100644 libs/community/tests/unit_tests/tools/audio/test_tools.py

diff --git a/libs/community/langchain_community/tools/audio/__init__.py b/libs/community/langchain_community/tools/audio/__init__.py
new file mode 100644
index 0000000000..9024dc6fea
--- /dev/null
+++ b/libs/community/langchain_community/tools/audio/__init__.py
@@ -0,0 +1,7 @@
+from langchain_community.tools.audio.huggingface_text_to_speech_inference import (
+    HuggingFaceTextToSpeechModelInference,
+)
+
+__all__ = [
+    "HuggingFaceTextToSpeechModelInference",
+]
diff --git a/libs/community/langchain_community/tools/audio/huggingface_text_to_speech_inference.py b/libs/community/langchain_community/tools/audio/huggingface_text_to_speech_inference.py
new file mode 100644
index 0000000000..c728f8ad22
--- /dev/null
+++ b/libs/community/langchain_community/tools/audio/huggingface_text_to_speech_inference.py
@@ -0,0 +1,153 @@
+import logging
+import os
+import uuid
+from datetime import datetime
+from typing import Callable, Literal, Optional
+
+import requests
+from langchain_core.callbacks import CallbackManagerForToolRun
+from langchain_core.pydantic_v1 import SecretStr
+from langchain_core.tools import BaseTool
+
+logger = logging.getLogger(__name__)
+
+
+class HuggingFaceTextToSpeechModelInference(BaseTool):
+    """HuggingFace Text-to-Speech Model Inference.
+
+    Posts the input text to the Hugging Face Inference API for ``model`` and
+    saves the returned audio bytes to a new file in ``destination_dir``.
+
+    Requirements:
+        - Environment variable ``HUGGINGFACE_API_KEY`` must be set,
+          or passed as a named parameter to the constructor.
+    """
+
+    name: str = "huggingface_text_to_speech"
+    """Name of the tool."""
+    description: str = (
+        "A wrapper around the Hugging Face Text-to-Speech Inference API. "
+        "Input is the text to speak; output is the path of the saved audio file."
+    )
+    """Description of the tool."""
+
+    model: str
+    """Model name."""
+    file_extension: str
+    """File extension of the output audio file."""
+    destination_dir: str
+    """Directory to save the output audio file."""
+    file_namer: Callable[[], str]
+    """Function to generate unique file names."""
+
+    api_url: str
+    """Inference API endpoint for ``model``."""
+    huggingface_api_key: SecretStr
+    """API key sent as a bearer token."""
+
+    _HUGGINGFACE_API_KEY_ENV_NAME = "HUGGINGFACE_API_KEY"
+    _HUGGINGFACE_API_URL_ROOT = "https://api-inference.huggingface.co/models"
+
+    def __init__(
+        self,
+        model: str,
+        file_extension: str,
+        *,
+        destination_dir: str = "./tts",
+        file_naming_func: Literal["uuid", "timestamp"] = "uuid",
+        huggingface_api_key: Optional[SecretStr] = None,
+    ) -> None:
+        """Create the tool.
+
+        Args:
+            model: Hugging Face model id, appended to the inference API URL.
+            file_extension: Extension (without the dot) of saved audio files.
+            destination_dir: Directory the audio files are written to.
+            file_naming_func: ``"uuid"`` names files with a random UUID4;
+                ``"timestamp"`` uses the current Unix time in whole seconds.
+            huggingface_api_key: API key; when empty, the
+                ``HUGGINGFACE_API_KEY`` environment variable is used instead.
+
+        Raises:
+            ValueError: If no API key is available or ``file_naming_func``
+                is not a supported value.
+        """
+        if not huggingface_api_key:
+            huggingface_api_key = SecretStr(
+                os.getenv(self._HUGGINGFACE_API_KEY_ENV_NAME, "")
+            )
+
+        if not huggingface_api_key.get_secret_value():
+            raise ValueError(
+                f"'{self._HUGGINGFACE_API_KEY_ENV_NAME}' must be set or passed"
+            )
+
+        if file_naming_func == "uuid":
+            file_namer = lambda: str(uuid.uuid4())  # noqa: E731
+        elif file_naming_func == "timestamp":
+            file_namer = lambda: str(int(datetime.now().timestamp()))  # noqa: E731
+        else:
+            raise ValueError(
+                f"Invalid value for 'file_naming_func': {file_naming_func}"
+            )
+
+        super().__init__(
+            model=model,
+            file_extension=file_extension,
+            api_url=f"{self._HUGGINGFACE_API_URL_ROOT}/{model}",
+            destination_dir=destination_dir,
+            file_namer=file_namer,
+            huggingface_api_key=huggingface_api_key,
+        )
+
+    def _run(
+        self,
+        query: str,
+        run_manager: Optional[CallbackManagerForToolRun] = None,
+    ) -> str:
+        """Convert ``query`` to speech and save it to a new audio file.
+
+        Args:
+            query: Text sent to the model as the ``inputs`` payload.
+            run_manager: Optional callback manager; not used here.
+
+        Returns:
+            Path of the audio file that was written.
+
+        Raises:
+            requests.HTTPError: If the API responds with an error status.
+            ValueError: If the generated file name already exists.
+        """
+        response = requests.post(
+            self.api_url,
+            headers={
+                "Authorization": f"Bearer {self.huggingface_api_key.get_secret_value()}"
+            },
+            json={"inputs": query},
+        )
+        # An error response body is not audio; fail instead of saving it.
+        response.raise_for_status()
+        audio_bytes = response.content
+
+        try:
+            os.makedirs(self.destination_dir, exist_ok=True)
+        except Exception as e:
+            logger.error(f"Error creating directory '{self.destination_dir}': {e}")
+            raise
+
+        output_file = os.path.join(
+            self.destination_dir,
+            f"{self.file_namer()}.{self.file_extension}",
+        )
+
+        try:
+            # "x" mode refuses to overwrite an existing file.
+            with open(output_file, mode="xb") as f:
+                f.write(audio_bytes)
+        except FileExistsError as e:
+            raise ValueError("Output name must be unique") from e
+        except Exception as e:
+            logger.error(f"Error occurred while creating file: {e}")
+            raise
+
+        return output_file
diff --git a/libs/community/tests/unit_tests/tools/audio/__init__.py b/libs/community/tests/unit_tests/tools/audio/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/libs/community/tests/unit_tests/tools/audio/test_tools.py b/libs/community/tests/unit_tests/tools/audio/test_tools.py
new file mode 100644
index 0000000000..03055ad36c
--- /dev/null
+++ b/libs/community/tests/unit_tests/tools/audio/test_tools.py
@@ -0,0 +1,119 @@
+"""Test Audio Tools."""
+
+import os
+import tempfile
+import uuid
+from unittest.mock import Mock, mock_open, patch
+
+import pytest
+import requests
+from langchain_core.pydantic_v1 import SecretStr
+
+from langchain_community.tools.audio import HuggingFaceTextToSpeechModelInference
+
+AUDIO_FORMAT_EXT = "wav"
+
+
+def test_huggingface_tts_constructor(monkeypatch: pytest.MonkeyPatch) -> None:
+    # No key in the environment and none passed.
+    monkeypatch.delenv("HUGGINGFACE_API_KEY", raising=False)
+    with pytest.raises(ValueError):
+        HuggingFaceTextToSpeechModelInference(
+            model="test/model",
+            file_extension=AUDIO_FORMAT_EXT,
+        )
+
+    # An empty key falls back to the (unset) environment variable.
+    with pytest.raises(ValueError):
+        HuggingFaceTextToSpeechModelInference(
+            model="test/model",
+            file_extension=AUDIO_FORMAT_EXT,
+            huggingface_api_key=SecretStr(""),
+        )
+
+    # Key passed explicitly.
+    HuggingFaceTextToSpeechModelInference(
+        model="test/model",
+        file_extension=AUDIO_FORMAT_EXT,
+        huggingface_api_key=SecretStr("foo"),
+    )
+
+    # Key picked up from the environment.
+    monkeypatch.setenv("HUGGINGFACE_API_KEY", "foo")
+    HuggingFaceTextToSpeechModelInference(
+        model="test/model",
+        file_extension=AUDIO_FORMAT_EXT,
+    )
+
+
+def test_huggingface_tts_run_with_requests_mock(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("HUGGINGFACE_API_KEY", "foo")
+
+    with tempfile.TemporaryDirectory() as tmp_dir, patch(
+        "uuid.uuid4"
+    ) as mock_uuid, patch("requests.post") as mock_inference, patch(
+        "builtins.open", mock_open()
+    ) as mock_file:
+        input_query = "Dummy input"
+
+        mock_uuid_value = uuid.UUID("00000000-0000-0000-0000-000000000000")
+        mock_uuid.return_value = mock_uuid_value
+
+        expected_output_file_base_name = os.path.join(tmp_dir, str(mock_uuid_value))
+        expected_output_file = f"{expected_output_file_base_name}.{AUDIO_FORMAT_EXT}"
+
+        test_audio_content = b"test_audio_bytes"
+
+        tts = HuggingFaceTextToSpeechModelInference(
+            model="test/model",
+            file_extension=AUDIO_FORMAT_EXT,
+            destination_dir=tmp_dir,
+            file_naming_func="uuid",
+        )
+
+        # Mock the requests.post response
+        mock_response = Mock()
+        mock_response.content = test_audio_content
+        mock_inference.return_value = mock_response
+
+        output_path = tts._run(input_query)
+
+        assert output_path == expected_output_file
+
+        mock_inference.assert_called_once_with(
+            tts.api_url,
+            headers={
+                "Authorization": f"Bearer {tts.huggingface_api_key.get_secret_value()}"
+            },
+            json={"inputs": input_query},
+        )
+
+        mock_file.assert_called_once_with(expected_output_file, mode="xb")
+        mock_file.return_value.write.assert_called_once_with(test_audio_content)
+
+
+def test_huggingface_tts_run_raises_on_http_error(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("HUGGINGFACE_API_KEY", "foo")
+
+    with tempfile.TemporaryDirectory() as tmp_dir, patch(
+        "requests.post"
+    ) as mock_inference, patch("builtins.open", mock_open()) as mock_file:
+        tts = HuggingFaceTextToSpeechModelInference(
+            model="test/model",
+            file_extension=AUDIO_FORMAT_EXT,
+            destination_dir=tmp_dir,
+        )
+
+        mock_response = Mock()
+        mock_response.raise_for_status.side_effect = requests.HTTPError("boom")
+        mock_inference.return_value = mock_response
+
+        with pytest.raises(requests.HTTPError):
+            tts._run("Dummy input")
+
+        # Nothing may be written when the API call failed.
+        mock_file.assert_not_called()