From 10e0431e48fe9e987c821a271392961100e1ce02 Mon Sep 17 00:00:00 2001
From: Massimiliano Pronesti
Date: Mon, 4 Sep 2023 09:24:13 +0200
Subject: [PATCH] feat(llms): add model_kwargs to hf tgi (#10139)

@baskaryan Following what we discussed in #9724 and your suggestion, I've
added a `model_kwargs` parameter to hf tgi.
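
A minimal usage sketch (illustrative only; `best_of` stands in for any
`text_generation` parameter that is not modeled as an explicit field on the
class):

```python
from langchain.llms import HuggingFaceTextGenInference

# Extra generation parameters can be grouped under `model_kwargs`; they are
# merged into the call parameters by `_default_params`.
llm = HuggingFaceTextGenInference(
    inference_server_url="http://localhost:8010/",
    max_new_tokens=512,
    model_kwargs={"best_of": 2},  # assumed to be a parameter the TGI client accepts
)

# Unknown top-level kwargs are also routed into `model_kwargs` by the new
# `build_extra` validator, which logs a warning first:
llm = HuggingFaceTextGenInference(
    inference_server_url="http://localhost:8010/",
    best_of=2,  # warns, then lands in llm.model_kwargs
)
print(llm("What is Deep Learning?"))
```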
---
 .../llms/huggingface_text_gen_inference.py    | 122 +++++++++++-------
 1 file changed, 76 insertions(+), 46 deletions(-)

diff --git a/libs/langchain/langchain/llms/huggingface_text_gen_inference.py b/libs/langchain/langchain/llms/huggingface_text_gen_inference.py
index 284890579b..6545078f16 100644
--- a/libs/langchain/langchain/llms/huggingface_text_gen_inference.py
+++ b/libs/langchain/langchain/llms/huggingface_text_gen_inference.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Any, AsyncIterator, Dict, Iterator, List, Optional
 
 from langchain.callbacks.manager import (
@@ -7,89 +8,91 @@ from langchain.callbacks.manager import (
 from langchain.llms.base import LLM
 from langchain.pydantic_v1 import Extra, Field, root_validator
 from langchain.schema.output import GenerationChunk
+from langchain.utils import get_pydantic_field_names
+
+logger = logging.getLogger(__name__)
 
 
 class HuggingFaceTextGenInference(LLM):
     """
     HuggingFace text generation API.
 
-    It generates text from a given prompt.
+    To use, you should have the `text-generation` python package installed and
+    a text-generation server running.
 
-    Attributes:
-    - max_new_tokens: The maximum number of tokens to generate.
-    - top_k: The number of top-k tokens to consider when generating text.
-    - top_p: The cumulative probability threshold for generating text.
-    - typical_p: The typical probability threshold for generating text.
-    - temperature: The temperature to use when generating text.
-    - repetition_penalty: The repetition penalty to use when generating text.
-    - truncate: truncate inputs tokens to the given size
-    - stop_sequences: A list of stop sequences to use when generating text.
-    - seed: The seed to use when generating text.
-    - inference_server_url: The URL of the inference server to use.
-    - timeout: The timeout value in seconds to use while connecting to inference server.
-    - server_kwargs: The keyword arguments to pass to the inference server.
-    - client: The client object used to communicate with the inference server.
-    - async_client: The async client object used to communicate with the server.
-
-    Methods:
-    - _call: Generates text based on a given prompt and stop sequences.
-    - _acall: Async generates text based on a given prompt and stop sequences.
-    - _llm_type: Returns the type of LLM.
-    - _default_params: Returns the default parameters for calling text generation
-        inference API.
-    """
-
-    """
     Example:
         .. code-block:: python
 
             # Basic Example (no streaming)
             llm = HuggingFaceTextGenInference(
-                inference_server_url = "http://localhost:8010/",
-                max_new_tokens = 512,
-                top_k = 10,
-                top_p = 0.95,
-                typical_p = 0.95,
-                temperature = 0.01,
-                repetition_penalty = 1.03,
+                inference_server_url="http://localhost:8010/",
+                max_new_tokens=512,
+                top_k=10,
+                top_p=0.95,
+                typical_p=0.95,
+                temperature=0.01,
+                repetition_penalty=1.03,
             )
             print(llm("What is Deep Learning?"))
-            
+
             # Streaming response example
             from langchain.callbacks import streaming_stdout
-            
+
             callbacks = [streaming_stdout.StreamingStdOutCallbackHandler()]
             llm = HuggingFaceTextGenInference(
-                inference_server_url = "http://localhost:8010/",
-                max_new_tokens = 512,
-                top_k = 10,
-                top_p = 0.95,
-                typical_p = 0.95,
-                temperature = 0.01,
-                repetition_penalty = 1.03,
-                callbacks = callbacks,
-                streaming = True
+                inference_server_url="http://localhost:8010/",
+                max_new_tokens=512,
+                top_k=10,
+                top_p=0.95,
+                typical_p=0.95,
+                temperature=0.01,
+                repetition_penalty=1.03,
+                callbacks=callbacks,
+                streaming=True
             )
             print(llm("What is Deep Learning?"))
-            
+
     """
 
     max_new_tokens: int = 512
+    """Maximum number of generated tokens"""
     top_k: Optional[int] = None
+    """The number of highest probability vocabulary tokens to keep for
+    top-k-filtering."""
     top_p: Optional[float] = 0.95
+    """If set to < 1, only the smallest set of most probable tokens with probabilities
+    that add up to `top_p` or higher are kept for generation."""
     typical_p: Optional[float] = 0.95
+    """Typical Decoding mass. See [Typical Decoding for Natural Language
+    Generation](https://arxiv.org/abs/2202.00666) for more information."""
     temperature: float = 0.8
+    """The value used to modulate the logits distribution."""
     repetition_penalty: Optional[float] = None
+    """The parameter for repetition penalty. 1.0 means no penalty.
+    See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details."""
     return_full_text: bool = False
+    """Whether to prepend the prompt to the generated text"""
     truncate: Optional[int] = None
+    """Truncate inputs tokens to the given size"""
     stop_sequences: List[str] = Field(default_factory=list)
+    """Stop generating tokens if a member of `stop_sequences` is generated"""
     seed: Optional[int] = None
+    """Random sampling seed"""
     inference_server_url: str = ""
+    """text-generation-inference instance base url"""
     timeout: int = 120
-    server_kwargs: Dict[str, Any] = Field(default_factory=dict)
+    """Timeout in seconds"""
     streaming: bool = False
+    """Whether to generate a stream of tokens asynchronously"""
     do_sample: bool = False
+    """Activate logits sampling"""
     watermark: bool = False
+    """Watermarking with [A Watermark for Large Language Models]
+    (https://arxiv.org/abs/2301.10226)"""
+    server_kwargs: Dict[str, Any] = Field(default_factory=dict)
+    """Holds any text-generation-inference server parameters not explicitly specified"""
+    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
+    """Holds any model parameters valid for `call` not explicitly specified"""
     client: Any
     async_client: Any
 
@@ -98,6 +101,32 @@ class HuggingFaceTextGenInference(LLM):
 
         extra = Extra.forbid
 
+    @root_validator(pre=True)
+    def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Build extra kwargs from additional params that were passed in."""
+        all_required_field_names = get_pydantic_field_names(cls)
+        extra = values.get("model_kwargs", {})
+        for field_name in list(values):
+            if field_name in extra:
+                raise ValueError(f"Found {field_name} supplied twice.")
+            if field_name not in all_required_field_names:
+                logger.warning(
+                    f"""WARNING! {field_name} is not a default parameter.
+                    {field_name} was transferred to model_kwargs.
+                    Please confirm that {field_name} is what you intended."""
+                )
+                extra[field_name] = values.pop(field_name)
+
+        invalid_model_kwargs = all_required_field_names.intersection(extra.keys())
+        if invalid_model_kwargs:
+            raise ValueError(
+                f"Parameters {invalid_model_kwargs} should be specified explicitly. "
+                f"Instead they were passed in as part of `model_kwargs` parameter."
+            )
+
+        values["model_kwargs"] = extra
+        return values
+
     @root_validator()
     def validate_environment(cls, values: Dict) -> Dict:
         """Validate that python package exists in environment."""
@@ -143,6 +172,7 @@ class HuggingFaceTextGenInference(LLM):
             "seed": self.seed,
             "do_sample": self.do_sample,
             "watermark": self.watermark,
+            **self.model_kwargs,
         }
 
     def _invocation_params(