From a7500ab0fbf6e9d00bcee9ada8bd5fb5d9e539c1 Mon Sep 17 00:00:00 2001 From: Alexander Kozlov Date: Tue, 12 Mar 2024 00:00:31 +0400 Subject: [PATCH] docs: Update huggingface pipelines notebook (#18801) --- .../llms/huggingface_pipelines.ipynb | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/docs/docs/integrations/llms/huggingface_pipelines.ipynb b/docs/docs/integrations/llms/huggingface_pipelines.ipynb index 4c07c06e03..6f48849b57 100644 --- a/docs/docs/integrations/llms/huggingface_pipelines.ipynb +++ b/docs/docs/integrations/llms/huggingface_pipelines.ipynb @@ -259,6 +259,26 @@ "!optimum-cli export openvino --model gpt2 ov_model" ] }, + { + "cell_type": "markdown", + "id": "0f7a6d21", + "metadata": {}, + "source": [ + "It is recommended to apply 8 or 4-bit weight quantization to reduce inference latency and model footprint using `--weight-format`:" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "97088ea0", + "metadata": {}, + "outputs": [], + "source": [ + "!optimum-cli export openvino --model gpt2 --weight-format int8 ov_model # for 8-bit quantization\n", + "\n", + "!optimum-cli export openvino --model gpt2 --weight-format int4 ov_model # for 4-bit quantization" ] }, { "cell_type": "code", "execution_count": null, @@ -280,6 +300,38 @@ "\n", "print(ov_chain.invoke({\"question\": question}))" ] }, + { + "cell_type": "markdown", + "id": "a2c5726c", + "metadata": {}, + "source": [ + "You can get additional inference speed improvement with Dynamic Quantization of activations and KV-cache quantization. These options can be enabled with `ov_config` as follows:" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f9c2c5", + "metadata": {}, + "outputs": [], + "source": [ + "ov_config = {\n", + "    \"KV_CACHE_PRECISION\": \"u8\",\n", + "    \"DYNAMIC_QUANTIZATION_GROUP_SIZE\": \"32\",\n", + "    \"PERFORMANCE_HINT\": \"LATENCY\",\n", + "    \"NUM_STREAMS\": \"1\",\n", + "    \"CACHE_DIR\": \"\",\n", + "}" ] }, + { + "cell_type": "markdown", + "id": "da9a9239", + "metadata": {}, + "source": [ + "For more information refer to [OpenVINO LLM guide](https://docs.openvino.ai/2024/openvino-workflow/generative-ai-models-guide.html)." ] } ], "metadata": {