From a7500ab0fbf6e9d00bcee9ada8bd5fb5d9e539c1 Mon Sep 17 00:00:00 2001 From: Alexander Kozlov Date: Tue, 12 Mar 2024 00:00:31 +0400 Subject: [PATCH] docs: Update huggingface pipelines notebook (#18801) --- .../llms/huggingface_pipelines.ipynb | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/docs/docs/integrations/llms/huggingface_pipelines.ipynb b/docs/docs/integrations/llms/huggingface_pipelines.ipynb index 4c07c06e03..6f48849b57 100644 --- a/docs/docs/integrations/llms/huggingface_pipelines.ipynb +++ b/docs/docs/integrations/llms/huggingface_pipelines.ipynb @@ -259,6 +259,26 @@ "!optimum-cli export openvino --model gpt2 ov_model" ] }, + { + "cell_type": "markdown", + "id": "0f7a6d21", + "metadata": {}, + "source": [ + "It is recommended to apply 8 or 4-bit weight quantization to reduce inference latency and model footprint using `--weight-format`:" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "97088ea0", + "metadata": {}, + "outputs": [], + "source": [ + "!optimum-cli export openvino --model gpt2 --weight-format int8 ov_model # for 8-bit quantization\n", + "\n", + "!optimum-cli export openvino --model gpt2 --weight-format int4 ov_model # for 4-bit quantization" ] }, { "cell_type": "code", "execution_count": null, @@ -280,6 +300,38 @@ "\n", "print(ov_chain.invoke({\"question\": question}))" ] }, + { + "cell_type": "markdown", + "id": "a2c5726c", + "metadata": {}, + "source": [ + "You can get additional inference speed improvement with Dynamic Quantization of activations and KV-cache quantization. These options can be enabled with `ov_config` as follows:" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1f9c2c5", + "metadata": {}, + "outputs": [], + "source": [ + "ov_config = {\n", + "    \"KV_CACHE_PRECISION\": \"u8\",\n", + "    \"DYNAMIC_QUANTIZATION_GROUP_SIZE\": \"32\",\n", + "    \"PERFORMANCE_HINT\": \"LATENCY\",\n", + "    \"NUM_STREAMS\": \"1\",\n", + "    \"CACHE_DIR\": \"\",\n", + "}" ] }, + { + "cell_type": "markdown", + "id": "da9a9239", + "metadata": {}, + "source": [ + "For more information refer to [OpenVINO LLM guide](https://docs.openvino.ai/2024/openvino-workflow/generative-ai-models-guide.html)." ] } ], "metadata": {