diff --git a/docs/extras/modules/model_io/models/llms/integrations/huggingface_textgen_inference.ipynb b/docs/extras/modules/model_io/models/llms/integrations/huggingface_textgen_inference.ipynb index 72c79cee7b..6aacfc8a31 100644 --- a/docs/extras/modules/model_io/models/llms/integrations/huggingface_textgen_inference.ipynb +++ b/docs/extras/modules/model_io/models/llms/integrations/huggingface_textgen_inference.ipynb @@ -48,6 +48,36 @@ ")\n", "llm(\"What did foo say about bar?\")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.llms import HuggingFaceTextGenInference\n", + "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n", + "\n", + "\n", + "llm = HuggingFaceTextGenInference(\n", + " inference_server_url=\"http://localhost:8010/\",\n", + " max_new_tokens=512,\n", + " top_k=10,\n", + " top_p=0.95,\n", + " typical_p=0.95,\n", + " temperature=0.01,\n", + " repetition_penalty=1.03,\n", + " stream=True\n", + ")\n", + "llm(\"What did foo say about bar?\", callbacks=[StreamingStdOutCallbackHandler()])" + ] } ], "metadata": { diff --git a/docs/snippets/modules/model_io/models/llms/how_to/streaming_llm.mdx b/docs/snippets/modules/model_io/models/llms/how_to/streaming_llm.mdx index f15474a7e9..88240bd1c1 100644 --- a/docs/snippets/modules/model_io/models/llms/how_to/streaming_llm.mdx +++ b/docs/snippets/modules/model_io/models/llms/how_to/streaming_llm.mdx @@ -1,5 +1,4 @@ -Currently, we support streaming for the `OpenAI`, `ChatOpenAI`, and `ChatAnthropic` implementations. To utilize streaming, use a [`CallbackHandler`](https://github.com/hwchase17/langchain/blob/master/langchain/callbacks/base.py) that implements `on_llm_new_token`. In this example, we are using `StreamingStdOutCallbackHandler`. 
- +Currently, we support streaming for a broad range of LLM implementations, including but not limited to `OpenAI`, `ChatOpenAI`, `ChatAnthropic`, `Hugging Face Text Generation Inference`, and `Replicate`. This feature has been expanded to accommodate most models. To utilize streaming, use a [`CallbackHandler`](https://github.com/hwchase17/langchain/blob/master/langchain/callbacks/base.py) that implements `on_llm_new_token`. In this example, we are using `StreamingStdOutCallbackHandler`. ```python from langchain.llms import OpenAI from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler