docs: add quantization to vllm and update API (#16950)

- **Description:** Update the vLLM docs to cover quantized models and to replace
deprecated method calls (`llm(...)`, `llm_chain.run(...)`) with `invoke`.
Massimiliano Pronesti authored 8 months ago; committed by GitHub
parent 2a510c71a0
commit 71f9ea33b6

@@ -82,7 +82,7 @@
 " temperature=0.8,\n",
 ")\n",
 "\n",
-"print(llm(\"What is the capital of France ?\"))"
+"print(llm.invoke(\"What is the capital of France ?\"))"
 ]
 },
 {
@@ -117,8 +117,7 @@
 "1. The first Pokemon game was released in 1996.\n",
 "2. The president was Bill Clinton.\n",
 "3. Clinton was president from 1993 to 2001.\n",
-"4. The answer is Clinton.\n",
-"\n"
+"4. The answer is Clinton.\n"
 ]
 },
 {
@@ -142,7 +141,7 @@
 "\n",
 "question = \"Who was the US president in the year the first Pokemon game was released?\"\n",
 "\n",
-"print(llm_chain.run(question))"
+"print(llm_chain.invoke(question))"
 ]
 },
 {
@@ -172,7 +171,36 @@
 " trust_remote_code=True, # mandatory for hf models\n",
 ")\n",
 "\n",
-"llm(\"What is the future of AI?\")"
+"llm.invoke(\"What is the future of AI?\")"
+]
+},
+{
+"cell_type": "markdown",
+"id": "d6ca8fd911d25faa",
+"metadata": {
+"collapsed": false
+},
+"source": [
+"## Quantization\n",
+"\n",
+"vLLM supports `awq` quantization. To enable it, pass `quantization` to `vllm_kwargs`."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "2cada3174c46a0ea",
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"llm_q = VLLM(\n",
+" model=\"TheBloke/Llama-2-7b-Chat-AWQ\",\n",
+" trust_remote_code=True,\n",
+" max_new_tokens=512,\n",
+" vllm_kwargs={\"quantization\": \"awq\"},\n",
+")"
 ]
 },
 {
@@ -216,7 +244,7 @@
 " model_name=\"tiiuae/falcon-7b\",\n",
 " model_kwargs={\"stop\": [\".\"]},\n",
 ")\n",
-"print(llm(\"Rome is\"))"
+"print(llm.invoke(\"Rome is\"))"
 ]
 }
 ],
