Takeoff pro support (#12070)

**Description:** This PR adds support for the [Pro version of Titan Takeoff Server](https://docs.titanml.co/docs/category/pro-features). Users of the Pro version will have to import the TitanTakeoffPro model, which is different from TitanTakeoff. **Issue:** Also minor fixes to docs for Titan Takeoff (Community version) **Dependencies:** No additional dependencies **Twitter handle:** @becoming_blake @baskaryan @hwchase17
12 months ago · b9410f2b6f
parent 4e47fe1dce
commit b9410f2b6f
5 changed files with 344 additions and 1 deletions
--- a/docs/docs/integrations/llms/titan_takeoff.ipynb
+++ b/docs/docs/integrations/llms/titan_takeoff.ipynb
@ -108,7 +108,7 @@
    "from langchain.llms import TitanTakeoff\n",
    "\n",
    "llm = TitanTakeoff(\n",
-    "    baseURL=\"http://localhost:8000\",\n",
+    "    base_url=\"http://localhost:8000\",\n",
    "    generate_max_length=128,\n",
    "    temperature=1.0\n",
    ")\n",
--- a/docs/docs/integrations/llms/titan_takeoff_pro.ipynb
+++ b/docs/docs/integrations/llms/titan_takeoff_pro.ipynb
@ -0,0 +1,100 @@
 {
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Titan Takeoff Pro\n",
    "\n",
    "`TitanML` helps businesses build and deploy better, smaller, cheaper, and faster NLP models through our training, compression, and inference optimization platform.\n",
    "\n",
    ">Note: These docs are for the Pro version of Titan Takeoff. For the community version, see the page for Titan Takeoff.\n",
    "\n",
    "Our inference server, [Titan Takeoff (Pro Version)](https://docs.titanml.co/docs/titan-takeoff/pro-features/feature-comparison) enables deployment of LLMs locally on your hardware in a single command. Most generative model architectures are supported, such as Falcon, Llama 2, GPT2, T5 and many more."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example usage\n",
    "Here are some helpful examples to get started using the Pro version of Titan Takeoff Server.\n",
    "No parameters are needed by default, but a baseURL that points to your desired URL where Takeoff is running can be specified and generation parameters can be supplied."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.llms import TitanTakeoffPro\n",
    "from langchain.prompts import PromptTemplate\n",
    "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
    "from langchain.callbacks.manager import CallbackManager\n",
    "\n",
    "# Example 1: Basic use\n",
    "llm = TitanTakeoffPro()\n",
    "output = llm(\"What is the weather in London in August?\")\n",
    "print(output)\n",
    "\n",
    "\n",
    "# Example 2: Specifying a port and other generation parameters\n",
    "llm = TitanTakeoffPro(\n",
    "    base_url=\"http://localhost:3000\",\n",
    "    min_new_tokens=128,\n",
    "    max_new_tokens=512,\n",
    "    no_repeat_ngram_size=2,\n",
    "    sampling_topk= 1,\n",
    "    sampling_topp= 1.0,\n",
    "    sampling_temperature= 1.0,\n",
    "    repetition_penalty= 1.0,\n",
    "    regex_string= \"\",\n",
    ")\n",
    "output = llm(\"What is the largest rainforest in the world?\")\n",
    "print(output)\n",
    "\n",
    "\n",
    "# Example 3: Using generate for multiple inputs\n",
    "llm = TitanTakeoffPro()\n",
    "rich_output = llm.generate([\"What is Deep Learning?\", \"What is Machine Learning?\"])\n",
    "print(rich_output.generations)\n",
    "\n",
    "\n",
    "# Example 4: Streaming output\n",
    "llm = TitanTakeoffPro(streaming=True, callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))\n",
    "prompt = \"What is the capital of France?\"\n",
    "llm(prompt)\n",
    "\n",
    "# Example 5: Using LCEL\n",
    "llm = TitanTakeoffPro()\n",
    "prompt = PromptTemplate.from_template(\"Tell me about {topic}\")\n",
    "chain = prompt | llm\n",
    "chain.invoke({\"topic\": \"the universe\"})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
--- a/libs/langchain/langchain/llms/init.py
+++ b/libs/langchain/langchain/llms/init.py
@ -444,6 +444,12 @@ def _import_titan_takeoff() -> Any:
    return TitanTakeoff
 def _import_titan_takeoff_pro() -> Any:
    from langchain.llms.titan_takeoff_pro import TitanTakeoffPro
    return TitanTakeoffPro
 def _import_together() -> Any:
    from langchain.llms.together import Together
@ -639,6 +645,8 @@ def __getattr__(name: str) -> Any:
        return _import_textgen()
    elif name == "TitanTakeoff":
        return _import_titan_takeoff()
    elif name == "TitanTakeoffPro":
        return _import_titan_takeoff_pro()
    elif name == "Together":
        return _import_together()
    elif name == "Tongyi":
@ -735,6 +743,7 @@ __all__ = [
    "SelfHostedPipeline",
    "StochasticAI",
    "TitanTakeoff",
    "TitanTakeoffPro",
    "Tongyi",
    "VertexAI",
    "VertexAIModelGarden",
@ -813,6 +822,7 @@ def get_type_to_cls_dict() -> Dict[str, Callable[[], Type[BaseLLM]]]:
        "together": _import_together,
        "tongyi": _import_tongyi,
        "titan_takeoff": _import_titan_takeoff,
        "titan_takeoff_pro": _import_titan_takeoff_pro,
        "vertexai": _import_vertex,
        "vertexai_model_garden": _import_vertex_model_garden,
        "openllm": _import_openllm,
--- a/libs/langchain/langchain/llms/titan_takeoff_pro.py
+++ b/libs/langchain/langchain/llms/titan_takeoff_pro.py
@ -0,0 +1,215 @@
 from typing import Any, Iterator, List, Mapping, Optional
 import requests
 from requests.exceptions import ConnectionError
 from langchain.callbacks.manager import CallbackManagerForLLMRun
 from langchain.llms.base import LLM
 from langchain.llms.utils import enforce_stop_tokens
 from langchain.schema.output import GenerationChunk
 class TitanTakeoffPro(LLM):
    base_url: Optional[str] = "http://localhost:3000"
    """Specifies the baseURL to use for the Titan Takeoff Pro API.
    Default = http://localhost:3000.
    """
    max_new_tokens: Optional[int] = None
    """Maximum tokens generated."""
    min_new_tokens: Optional[int] = None
    """Minimum tokens generated."""
    sampling_topk: Optional[int] = None
    """Sample predictions from the top K most probable candidates."""
    sampling_topp: Optional[float] = None
    """Sample from predictions whose cumulative probability exceeds this value.
    """
    sampling_temperature: Optional[float] = None
    """Sample with randomness. Bigger temperatures are associated with 
    more randomness and 'creativity'.
    """
    repetition_penalty: Optional[float] = None
    """Penalise the generation of tokens that have been generated before. 
    Set to > 1 to penalize.
    """
    regex_string: Optional[str] = None
    """A regex string for constrained generation."""
    no_repeat_ngram_size: Optional[int] = None
    """Prevent repetitions of ngrams of this size. Default = 0 (turned off)."""
    streaming: bool = False
    """Whether to stream the output. Default = False."""
    @property
    def _default_params(self) -> Mapping[str, Any]:
        """Get the default parameters for calling Titan Takeoff Server (Pro)."""
        return {
            **(
                {"regex_string": self.regex_string}
                if self.regex_string is not None
                else {}
            ),
            **(
                {"sampling_temperature": self.sampling_temperature}
                if self.sampling_temperature is not None
                else {}
            ),
            **(
                {"sampling_topp": self.sampling_topp}
                if self.sampling_topp is not None
                else {}
            ),
            **(
                {"repetition_penalty": self.repetition_penalty}
                if self.repetition_penalty is not None
                else {}
            ),
            **(
                {"max_new_tokens": self.max_new_tokens}
                if self.max_new_tokens is not None
                else {}
            ),
            **(
                {"min_new_tokens": self.min_new_tokens}
                if self.min_new_tokens is not None
                else {}
            ),
            **(
                {"sampling_topk": self.sampling_topk}
                if self.sampling_topk is not None
                else {}
            ),
            **(
                {"no_repeat_ngram_size": self.no_repeat_ngram_size}
                if self.no_repeat_ngram_size is not None
                else {}
            ),
        }
    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "titan_takeoff_pro"
    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Call out to Titan Takeoff (Pro) generate endpoint.
        Args:
            prompt: The prompt to pass into the model.
            stop: Optional list of stop words to use when generating.
        Returns:
            The string generated by the model.
        Example:
            .. code-block:: python
                prompt = "What is the capital of the United Kingdom?"
                response = model(prompt)
        """
        try:
            if self.streaming:
                text_output = ""
                for chunk in self._stream(
                    prompt=prompt,
                    stop=stop,
                    run_manager=run_manager,
                ):
                    text_output += chunk.text
                return text_output
            url = f"{self.base_url}/generate"
            params = {"text": prompt, **self._default_params}
            response = requests.post(url, json=params)
            response.raise_for_status()
            response.encoding = "utf-8"
            text = ""
            if "text" in response.json():
                text = response.json()["text"]
                text = text.replace("</s>", "")
            else:
                raise ValueError("Something went wrong.")
            if stop is not None:
                text = enforce_stop_tokens(text, stop)
            return text
        except ConnectionError:
            raise ConnectionError(
                "Could not connect to Titan Takeoff (Pro) server. \
                Please make sure that the server is running."
            )
    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        """Call out to Titan Takeoff (Pro) stream endpoint.
        Args:
            prompt: The prompt to pass into the model.
            stop: Optional list of stop words to use when generating.
        Returns:
            The string generated by the model.
        Yields:
            A dictionary like object containing a string token.
        Example:
            .. code-block:: python
                prompt = "What is the capital of the United Kingdom?"
                response = model(prompt)
        """
        url = f"{self.base_url}/generate_stream"
        params = {"text": prompt, **self._default_params}
        response = requests.post(url, json=params, stream=True)
        response.encoding = "utf-8"
        buffer = ""
        for text in response.iter_content(chunk_size=1, decode_unicode=True):
            buffer += text
            if "data:" in buffer:
                # Remove the first instance of "data:" from the buffer.
                if buffer.startswith("data:"):
                    buffer = ""
                if len(buffer.split("data:", 1)) == 2:
                    content, _ = buffer.split("data:", 1)
                    buffer = content.rstrip("\n")
                # Trim the buffer to only have content after the "data:" part.
                if buffer:  # Ensure that there's content to process.
                    chunk = GenerationChunk(text=buffer)
                    buffer = ""  # Reset buffer for the next set of data.
                    yield chunk
                    if run_manager:
                        run_manager.on_llm_new_token(token=chunk.text)
        # Yield any remaining content in the buffer.
        if buffer:
            chunk = GenerationChunk(text=buffer.replace("</s>", ""))
            yield chunk
            if run_manager:
                run_manager.on_llm_new_token(token=chunk.text)
    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"base_url": self.base_url, **{}, **self._default_params}
--- a/libs/langchain/tests/integration_tests/llms/test_titan_takeoff_pro.py
+++ b/libs/langchain/tests/integration_tests/llms/test_titan_takeoff_pro.py
@ -0,0 +1,18 @@
 """Test Titan Takeoff wrapper."""
 import responses
 from langchain.llms.titan_takeoff_pro import TitanTakeoffPro
@responses.activate
 def test_titan_takeoff_pro_call() -> None:
    """Test valid call to Titan Takeoff."""
    url = "http://localhost:3000/generate"
    responses.add(responses.POST, url, json={"message": "2 + 2 is 4"}, status=200)
    # response = requests.post(url)
    llm = TitanTakeoffPro()
    output = llm("What is 2 + 2?")
    assert isinstance(output, str)