Add prompt hub for various use-cases (#9879)

Use prompt hub in our use-case docs and guides.
1 year ago · 16a27ab244
parent 00a7c31ffd
commit 16a27ab244
5 changed files with 335 additions and 478 deletions
--- a/docs/extras/guides/local_llms.ipynb
+++ b/docs/extras/guides/local_llms.ipynb
@ -264,88 +264,19 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "pip install llama-cpp-python"
+    "CMAKE_ARGS=\"-DLLAMA_METAL=on\" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 43,
-   "id": "9d5f94b5",
+   "execution_count": null,
+   "id": "a88bf0c8-e989-4bcd-bcb7-4d7757e684f2",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "objc[10142]: Class GGMLMetalClass is implemented in both /Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x2a0c4c208) and /Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/llama_cpp/libllama.dylib (0x2c28bc208). One of the two will be used. Which one is undefined.\n",
-      "llama.cpp: loading model from /Users/rlm/Desktop/Code/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin\n",
-      "llama_model_load_internal: format     = ggjt v3 (latest)\n",
-      "llama_model_load_internal: n_vocab    = 32000\n",
-      "llama_model_load_internal: n_ctx      = 2048\n",
-      "llama_model_load_internal: n_embd     = 5120\n",
-      "llama_model_load_internal: n_mult     = 256\n",
-      "llama_model_load_internal: n_head     = 40\n",
-      "llama_model_load_internal: n_layer    = 40\n",
-      "llama_model_load_internal: n_rot      = 128\n",
-      "llama_model_load_internal: freq_base  = 10000.0\n",
-      "llama_model_load_internal: freq_scale = 1\n",
-      "llama_model_load_internal: ftype      = 2 (mostly Q4_0)\n",
-      "llama_model_load_internal: n_ff       = 13824\n",
-      "llama_model_load_internal: model size = 13B\n",
-      "llama_model_load_internal: ggml ctx size =    0.09 MB\n",
-      "llama_model_load_internal: mem required  = 8953.71 MB (+ 1608.00 MB per state)\n",
-      "llama_new_context_with_model: kv self size  = 1600.00 MB\n",
-      "ggml_metal_init: allocating\n",
-      "ggml_metal_init: using MPS\n",
-      "ggml_metal_init: loading '/Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/llama_cpp/ggml-metal.metal'\n",
-      "ggml_metal_init: loaded kernel_add                            0x47774af60\n",
-      "ggml_metal_init: loaded kernel_mul                            0x47774bc00\n",
-      "ggml_metal_init: loaded kernel_mul_row                        0x47774c230\n",
-      "ggml_metal_init: loaded kernel_scale                          0x47774c890\n",
-      "ggml_metal_init: loaded kernel_silu                           0x47774cef0\n",
-      "ggml_metal_init: loaded kernel_relu                           0x10e33e500\n",
-      "ggml_metal_init: loaded kernel_gelu                           0x47774b2f0\n",
-      "ggml_metal_init: loaded kernel_soft_max                       0x47771a580\n",
-      "ggml_metal_init: loaded kernel_diag_mask_inf                  0x47774dab0\n",
-      "ggml_metal_init: loaded kernel_get_rows_f16                   0x47774e110\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_0                  0x47774e7d0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_1                  0x13efd7170\n",
-      "ggml_metal_init: loaded kernel_get_rows_q2_K                  0x13efd73d0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q3_K                  0x13efd7630\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_K                  0x13efd7890\n",
-      "ggml_metal_init: loaded kernel_get_rows_q5_K                  0x4744c9740\n",
-      "ggml_metal_init: loaded kernel_get_rows_q6_K                  0x4744ca6b0\n",
-      "ggml_metal_init: loaded kernel_rms_norm                       0x4744cb250\n",
-      "ggml_metal_init: loaded kernel_norm                           0x4744cb970\n",
-      "ggml_metal_init: loaded kernel_mul_mat_f16_f32                0x10e33f700\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_0_f32               0x10e33fcd0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_1_f32               0x4744cc2d0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q2_K_f32               0x4744cc6f0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q3_K_f32               0x4744cd6b0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_K_f32               0x4744cde20\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q5_K_f32               0x10e33ff30\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q6_K_f32               0x10e340190\n",
-      "ggml_metal_init: loaded kernel_rope                           0x10e3403f0\n",
-      "ggml_metal_init: loaded kernel_alibi_f32                      0x10e340de0\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f16                    0x10e3416d0\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f32                    0x10e342080\n",
-      "ggml_metal_init: loaded kernel_cpy_f16_f16                    0x10e342ca0\n",
-      "ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
-      "ggml_metal_init: hasUnifiedMemory             = true\n",
-      "ggml_metal_init: maxTransferRate              = built-in GPU\n",
-      "ggml_metal_add_buffer: allocated 'data            ' buffer, size =  6984.06 MB, ( 6986.19 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'eval            ' buffer, size =  1032.00 MB, ( 8018.19 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'kv              ' buffer, size =  1602.00 MB, ( 9620.19 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'scr0            ' buffer, size =   426.00 MB, (10046.19 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'scr1            ' buffer, size =   512.00 MB, (10558.19 / 21845.34)\n",
-      "AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | \n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from langchain.llms import LlamaCpp\n",
    "llm = LlamaCpp(\n",
-    "    model_path=\"/Users/rlm/Desktop/Code/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin\",\n",
+    "    model_path=\"/Users/rlm/Desktop/Code/llama.cpp/models/openorca-platypus2-13b.gguf.q4_0.bin\",\n",
    "    n_gpu_layers=1,\n",
    "    n_batch=512,\n",
    "    n_ctx=2048,\n",
@ -448,87 +379,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 46,
-   "id": "b55a2147",
+   "execution_count": null,
+   "id": "915ecd4c-8f6b-4de3-a787-b64cb7c682b4",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Found model file at  /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n",
-      "llama_new_context_with_model: max tensor size =    87.89 MB\n",
-      "llama_new_context_with_model: max tensor size =    87.89 MB\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "llama.cpp: using Metal\n",
-      "llama.cpp: loading model from /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n",
-      "llama_model_load_internal: format     = ggjt v3 (latest)\n",
-      "llama_model_load_internal: n_vocab    = 32001\n",
-      "llama_model_load_internal: n_ctx      = 2048\n",
-      "llama_model_load_internal: n_embd     = 5120\n",
-      "llama_model_load_internal: n_mult     = 256\n",
-      "llama_model_load_internal: n_head     = 40\n",
-      "llama_model_load_internal: n_layer    = 40\n",
-      "llama_model_load_internal: n_rot      = 128\n",
-      "llama_model_load_internal: ftype      = 2 (mostly Q4_0)\n",
-      "llama_model_load_internal: n_ff       = 13824\n",
-      "llama_model_load_internal: n_parts    = 1\n",
-      "llama_model_load_internal: model size = 13B\n",
-      "llama_model_load_internal: ggml ctx size =    0.09 MB\n",
-      "llama_model_load_internal: mem required  = 9031.71 MB (+ 1608.00 MB per state)\n",
-      "llama_new_context_with_model: kv self size  = 1600.00 MB\n",
-      "ggml_metal_init: allocating\n",
-      "ggml_metal_init: using MPS\n",
-      "ggml_metal_init: loading '/Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/ggml-metal.metal'\n",
-      "ggml_metal_init: loaded kernel_add                            0x37944d850\n",
-      "ggml_metal_init: loaded kernel_mul                            0x37944f350\n",
-      "ggml_metal_init: loaded kernel_mul_row                        0x37944fdd0\n",
-      "ggml_metal_init: loaded kernel_scale                          0x3794505a0\n",
-      "ggml_metal_init: loaded kernel_silu                           0x379450800\n",
-      "ggml_metal_init: loaded kernel_relu                           0x379450a60\n",
-      "ggml_metal_init: loaded kernel_gelu                           0x379450cc0\n",
-      "ggml_metal_init: loaded kernel_soft_max                       0x379450ff0\n",
-      "ggml_metal_init: loaded kernel_diag_mask_inf                  0x379451250\n",
-      "ggml_metal_init: loaded kernel_get_rows_f16                   0x3794514b0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_0                  0x379451710\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_1                  0x379451970\n",
-      "ggml_metal_init: loaded kernel_get_rows_q2_k                  0x379451bd0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q3_k                  0x379451e30\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_k                  0x379452090\n",
-      "ggml_metal_init: loaded kernel_get_rows_q5_k                  0x3794522f0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q6_k                  0x379452550\n",
-      "ggml_metal_init: loaded kernel_rms_norm                       0x3794527b0\n",
-      "ggml_metal_init: loaded kernel_norm                           0x379452a10\n",
-      "ggml_metal_init: loaded kernel_mul_mat_f16_f32                0x379452c70\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_0_f32               0x379452ed0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_1_f32               0x379453130\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q2_k_f32               0x379453390\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q3_k_f32               0x3794535f0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_k_f32               0x379453850\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q5_k_f32               0x379453ab0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q6_k_f32               0x379453d10\n",
-      "ggml_metal_init: loaded kernel_rope                           0x379453f70\n",
-      "ggml_metal_init: loaded kernel_alibi_f32                      0x3794541d0\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f16                    0x379454430\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f32                    0x379454690\n",
-      "ggml_metal_init: loaded kernel_cpy_f16_f16                    0x3794548f0\n",
-      "ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
-      "ggml_metal_init: hasUnifiedMemory             = true\n",
-      "ggml_metal_init: maxTransferRate              = built-in GPU\n",
-      "ggml_metal_add_buffer: allocated 'data            ' buffer, size =  6984.06 MB, (17542.94 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'eval            ' buffer, size =  1024.00 MB, (18566.94 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'kv              ' buffer, size =  1602.00 MB, (20168.94 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'scr0            ' buffer, size =   512.00 MB, (20680.94 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'scr1            ' buffer, size =   512.00 MB, (21192.94 / 21845.34)\n",
-      "ggml_metal_free: deallocating\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from langchain.llms import GPT4All\n",
    "llm = GPT4All(model=\"/Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\")"
@ -564,89 +418,21 @@
    "\n",
    "Some LLMs will benefit from specific prompts.\n",
    "\n",
-    "For example, llama2 can use [special tokens](https://twitter.com/RLanceMartin/status/1681879318493003776?s=20).\n",
+    "For example, LLaMA will use [special tokens](https://twitter.com/RLanceMartin/status/1681879318493003776?s=20).\n",
    "\n",
    "We can use `ConditionalPromptSelector` to set prompt based on the model type."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 57,
-   "id": "d082b10a",
+   "execution_count": null,
+   "id": "16759b7c-7903-4269-b7b4-f83b313d8091",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "llama.cpp: loading model from /Users/rlm/Desktop/Code/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin\n",
-      "llama_model_load_internal: format     = ggjt v3 (latest)\n",
-      "llama_model_load_internal: n_vocab    = 32000\n",
-      "llama_model_load_internal: n_ctx      = 2048\n",
-      "llama_model_load_internal: n_embd     = 5120\n",
-      "llama_model_load_internal: n_mult     = 256\n",
-      "llama_model_load_internal: n_head     = 40\n",
-      "llama_model_load_internal: n_layer    = 40\n",
-      "llama_model_load_internal: n_rot      = 128\n",
-      "llama_model_load_internal: freq_base  = 10000.0\n",
-      "llama_model_load_internal: freq_scale = 1\n",
-      "llama_model_load_internal: ftype      = 2 (mostly Q4_0)\n",
-      "llama_model_load_internal: n_ff       = 13824\n",
-      "llama_model_load_internal: model size = 13B\n",
-      "llama_model_load_internal: ggml ctx size =    0.09 MB\n",
-      "llama_model_load_internal: mem required  = 8953.71 MB (+ 1608.00 MB per state)\n",
-      "llama_new_context_with_model: kv self size  = 1600.00 MB\n",
-      "ggml_metal_init: allocating\n",
-      "ggml_metal_init: using MPS\n",
-      "ggml_metal_init: loading '/Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/llama_cpp/ggml-metal.metal'\n",
-      "ggml_metal_init: loaded kernel_add                            0x4744d09d0\n",
-      "ggml_metal_init: loaded kernel_mul                            0x3781cb3d0\n",
-      "ggml_metal_init: loaded kernel_mul_row                        0x37813bb60\n",
-      "ggml_metal_init: loaded kernel_scale                          0x474481080\n",
-      "ggml_metal_init: loaded kernel_silu                           0x4744d29f0\n",
-      "ggml_metal_init: loaded kernel_relu                           0x3781254c0\n",
-      "ggml_metal_init: loaded kernel_gelu                           0x47447f280\n",
-      "ggml_metal_init: loaded kernel_soft_max                       0x4744cf470\n",
-      "ggml_metal_init: loaded kernel_diag_mask_inf                  0x4744cf6d0\n",
-      "ggml_metal_init: loaded kernel_get_rows_f16                   0x4744cf930\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_0                  0x4744cfb90\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_1                  0x4744cfdf0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q2_K                  0x4744d0050\n",
-      "ggml_metal_init: loaded kernel_get_rows_q3_K                  0x4744ce980\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_K                  0x4744cebe0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q5_K                  0x4744cee40\n",
-      "ggml_metal_init: loaded kernel_get_rows_q6_K                  0x4744cf0a0\n",
-      "ggml_metal_init: loaded kernel_rms_norm                       0x474482450\n",
-      "ggml_metal_init: loaded kernel_norm                           0x4744826b0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_f16_f32                0x474482910\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_0_f32               0x474482b70\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_1_f32               0x474482dd0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q2_K_f32               0x474483030\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q3_K_f32               0x474483290\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_K_f32               0x4744834f0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q5_K_f32               0x474483750\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q6_K_f32               0x4744839b0\n",
-      "ggml_metal_init: loaded kernel_rope                           0x474483c10\n",
-      "ggml_metal_init: loaded kernel_alibi_f32                      0x474483e70\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f16                    0x4744840d0\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f32                    0x474484330\n",
-      "ggml_metal_init: loaded kernel_cpy_f16_f16                    0x474484590\n",
-      "ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
-      "ggml_metal_init: hasUnifiedMemory             = true\n",
-      "ggml_metal_init: maxTransferRate              = built-in GPU\n",
-      "ggml_metal_add_buffer: allocated 'data            ' buffer, size =  6984.06 MB, ( 6986.94 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'eval            ' buffer, size =  1032.00 MB, ( 8018.94 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'kv              ' buffer, size =  1602.00 MB, ( 9620.94 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'scr0            ' buffer, size =   426.00 MB, (10046.94 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'scr1            ' buffer, size =   512.00 MB, (10558.94 / 21845.34)\n",
-      "AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | \n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Set our LLM\n",
    "llm = LlamaCpp(\n",
-    "    model_path=\"/Users/rlm/Desktop/Code/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin\",\n",
+    "    model_path=\"/Users/rlm/Desktop/Code/llama.cpp/models/openorca-platypus2-13b.gguf.q4_0.bin\",\n",
    "    n_gpu_layers=1,\n",
    "    n_batch=512,\n",
    "    n_ctx=2048,\n",
@ -661,7 +447,7 @@
   "id": "66656084",
   "metadata": {},
   "source": [
-    "Set the associated prompt."
+    "Set the associated prompt based upon the model version."
   ]
  },
  {
@ -759,6 +545,18 @@
    "llm_chain.run({\"question\":question})"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "6e0d37e7-f1d9-4848-bf2c-c22392ee141f",
+   "metadata": {},
+   "source": [
+    "We can also use the LangChain Prompt Hub to fetch and/or store model-specific prompts.\n",
+    "\n",
+    "This will work with your [LangSmith API key](https://docs.smith.langchain.com/).\n",
+    "\n",
+    "For example, [here](https://smith.langchain.com/hub/rlm/rag-prompt-llama) is a prompt for RAG with LLaMA-specific tokens."
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "6ba66260",
@ -770,16 +568,12 @@
    "\n",
    "For example, here is a guide to [RAG](docs/use_cases/question_answering/how_to/local_retrieval_qa) with local LLMs.\n",
    "\n",
-    "In general, use cases for local model can be driven by at least two factors:\n",
+    "In general, use cases for local LLMs can be driven by at least two factors:\n",
    "\n",
    "* `Privacy`: private data (e.g., journals, etc) that a user does not want to share \n",
    "* `Cost`: text preprocessing (extraction/tagging), summarization, and agent simulations are token-use-intensive tasks\n",
    "\n",
-    "There are a few approach to support specific use-cases: \n",
-    "\n",
-    "* Fine-tuning (e.g., [gpt-llm-trainer](https://github.com/mshumer/gpt-llm-trainer), [Anyscale](https://www.anyscale.com/blog/fine-tuning-llama-2-a-comprehensive-case-study-for-tailoring-models-to-unique-applications)) \n",
-    "* [Function-calling](https://github.com/MeetKai/functionary/tree/main) for use-cases like extraction or tagging\n",
-    "\n"
+    "In addition, [here](https://blog.langchain.dev/using-langsmith-to-support-fine-tuning-of-open-source-llms/) is an overview of fine-tuning, which can utilize open source LLMs."
   ]
  }
 ],
--- a/docs/extras/use_cases/code_understanding.ipynb
+++ b/docs/extras/use_cases/code_understanding.ipynb
@ -921,6 +921,48 @@
    "llm(\"Question: In bash, how do I list all the text files in the current directory that have been modified in the last month? Answer:\")"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chains.question_answering import load_qa_chain\n",
+    "\n",
+    "# Prompt\n",
+    "template = \"\"\"Use the following pieces of context to answer the question at the end. \n",
+    "If you don't know the answer, just say that you don't know, don't try to make up an answer. \n",
+    "Use three sentences maximum and keep the answer as concise as possible. \n",
+    "{context}\n",
+    "Question: {question}\n",
+    "Helpful Answer:\"\"\"\n",
+    "QA_CHAIN_PROMPT = PromptTemplate(\n",
+    "    input_variables=[\"context\", \"question\"],\n",
+    "    template=template,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can also use the LangChain Prompt Hub to store and fetch prompts.\n",
+    "\n",
+    "This will work with your [LangSmith API key](https://docs.smith.langchain.com/).\n",
+    "\n",
+    "Let's try with a default RAG prompt, [here](https://smith.langchain.com/hub/rlm/rag-prompt)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain import hub\n",
+    "QA_CHAIN_PROMPT = hub.pull(\"rlm/rag-prompt-default\")"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 29,
@ -970,20 +1012,6 @@
    }
   ],
   "source": [
-    "from langchain.chains.question_answering import load_qa_chain\n",
-    "\n",
-    "# Prompt\n",
-    "template = \"\"\"Use the following pieces of context to answer the question at the end. \n",
-    "If you don't know the answer, just say that you don't know, don't try to make up an answer. \n",
-    "Use three sentences maximum and keep the answer as concise as possible. \n",
-    "{context}\n",
-    "Question: {question}\n",
-    "Helpful Answer:\"\"\"\n",
-    "QA_CHAIN_PROMPT = PromptTemplate(\n",
-    "    input_variables=[\"context\", \"question\"],\n",
-    "    template=template,\n",
-    ")\n",
-    "\n",
    "# Docs\n",
    "question = \"How can I initialize a ReAct agent?\"\n",
    "docs = retriever.get_relevant_documents(question)\n",
--- a/docs/extras/use_cases/question_answering/how_to/local_retrieval_qa.ipynb
+++ b/docs/extras/use_cases/question_answering/how_to/local_retrieval_qa.ipynb
@ -11,7 +11,9 @@
    "\n",
    "LangChain has [integrations](https://integrations.langchain.com/) with many open source LLMs that can be run locally.\n",
    "\n",
-    "For example, here we show how to run `GPT4All` or `Llama-v2` locally (e.g., on your laptop) using local embeddings and a local LLM.\n",
+    "See [here](docs/guides/local_llms) for setup instructions for these LLMs. \n",
+    "\n",
+    "For example, here we show how to run `GPT4All` or `LLaMA2` locally (e.g., on your laptop) using local embeddings and a local LLM.\n",
    "\n",
    "## Document Loading \n",
    "\n",
@ -25,7 +27,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "pip install gpt4all chromadb"
+    "pip install gpt4all chromadb langchainhub"
   ]
  },
  {
@ -40,7 +42,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 3,
   "id": "f8cf5765",
   "metadata": {},
   "outputs": [],
@ -66,7 +68,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 5,
   "id": "fdce8923",
   "metadata": {},
   "outputs": [
@ -76,6 +78,13 @@
     "text": [
      "Found model file at  /Users/rlm/.cache/gpt4all/ggml-all-MiniLM-L6-v2-f16.bin\n"
     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "objc[31511]: Class GGMLMetalClass is implemented in both /Users/rlm/miniforge3/envs/llama2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x14f4e8208) and /Users/rlm/miniforge3/envs/llama2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libllamamodel-mainline-metal.dylib (0x14f5fc208). One of the two will be used. Which one is undefined.\n"
+     ]
    }
   ],
   "source": [
@ -95,7 +104,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 6,
   "id": "b0c55e98",
   "metadata": {},
   "outputs": [
@ -105,7 +114,7 @@
       "4"
      ]
     },
-     "execution_count": 10,
+     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -118,7 +127,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 7,
   "id": "32b43339",
   "metadata": {},
   "outputs": [
@ -128,7 +137,7 @@
       "Document(page_content='Task decomposition can be done (1) by LLM with simple prompting like \"Steps for XYZ.\\\\n1.\", \"What are the subgoals for achieving XYZ?\", (2) by using task-specific instructions; e.g. \"Write a story outline.\" for writing a novel, or (3) with human inputs.', metadata={'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\\nAgent System Overview In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:', 'language': 'en', 'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': \"LLM Powered Autonomous Agents | Lil'Log\"})"
      ]
     },
-     "execution_count": 11,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -144,9 +153,15 @@
   "source": [
    "## Model \n",
    "\n",
-    "### Llama-v2\n",
+    "### LLaMA2\n",
    "\n",
-    "Download a GGML converted model (e.g., [here](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main))."
+    "Note: new versions of `llama-cpp-python` use GGUF model files (see [here](https://github.com/abetlen/llama-cpp-python/pull/633)).\n",
+    "\n",
+    "If you have an existing GGML model, see [here](docs/integrations/llms/llamacpp) for instructions on converting it to GGUF. \n",
+    "   \n",
+    "Alternatively, you can download a model that has already been converted to GGUF (e.g., [here](https://huggingface.co/TheBloke)).\n",
+    "\n",
+    "Finally, as noted in detail [here](docs/guides/local_llms), install `llama-cpp-python`."
   ]
  },
  {
@ -180,16 +195,16 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "2fd6fe25",
+   "id": "5884779a-957e-4c4c-b447-bc8385edc67e",
   "metadata": {},
   "outputs": [],
   "source": [
-    "! CMAKE_ARGS=\"-DLLAMA_METAL=on\" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir"
+    "! CMAKE_ARGS=\"-DLLAMA_METAL=on\" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "id": "cd7164e3",
   "metadata": {},
   "outputs": [],
@ -209,78 +224,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 28,
-   "id": "74718579",
+   "execution_count": null,
+   "id": "56158f83-6490-49b8-9f04-2e2e6ec3524b",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "llama.cpp: loading model from /Users/rlm/Desktop/Code/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin\n",
-      "llama_model_load_internal: format     = ggjt v3 (latest)\n",
-      "llama_model_load_internal: n_vocab    = 32000\n",
-      "llama_model_load_internal: n_ctx      = 2048\n",
-      "llama_model_load_internal: n_embd     = 5120\n",
-      "llama_model_load_internal: n_mult     = 256\n",
-      "llama_model_load_internal: n_head     = 40\n",
-      "llama_model_load_internal: n_layer    = 40\n",
-      "llama_model_load_internal: n_rot      = 128\n",
-      "llama_model_load_internal: freq_base  = 10000.0\n",
-      "llama_model_load_internal: freq_scale = 1\n",
-      "llama_model_load_internal: ftype      = 2 (mostly Q4_0)\n",
-      "llama_model_load_internal: n_ff       = 13824\n",
-      "llama_model_load_internal: model size = 13B\n",
-      "llama_model_load_internal: ggml ctx size =    0.09 MB\n",
-      "llama_model_load_internal: mem required  = 8819.71 MB (+ 1608.00 MB per state)\n",
-      "llama_new_context_with_model: kv self size  = 1600.00 MB\n",
-      "ggml_metal_init: allocating\n",
-      "ggml_metal_init: using MPS\n",
-      "ggml_metal_init: loading '/Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/llama_cpp/ggml-metal.metal'\n",
-      "ggml_metal_init: loaded kernel_add                            0x76add7460\n",
-      "ggml_metal_init: loaded kernel_mul                            0x76add5090\n",
-      "ggml_metal_init: loaded kernel_mul_row                        0x76addae00\n",
-      "ggml_metal_init: loaded kernel_scale                          0x76adb2940\n",
-      "ggml_metal_init: loaded kernel_silu                           0x76adb8610\n",
-      "ggml_metal_init: loaded kernel_relu                           0x76addb700\n",
-      "ggml_metal_init: loaded kernel_gelu                           0x76addc100\n",
-      "ggml_metal_init: loaded kernel_soft_max                       0x76addcb80\n",
-      "ggml_metal_init: loaded kernel_diag_mask_inf                  0x76addd600\n",
-      "ggml_metal_init: loaded kernel_get_rows_f16                   0x295f16380\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_0                  0x295f165e0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_1                  0x295f16840\n",
-      "ggml_metal_init: loaded kernel_get_rows_q2_K                  0x295f16aa0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q3_K                  0x295f16d00\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_K                  0x295f16f60\n",
-      "ggml_metal_init: loaded kernel_get_rows_q5_K                  0x295f171c0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q6_K                  0x295f17420\n",
-      "ggml_metal_init: loaded kernel_rms_norm                       0x295f17680\n",
-      "ggml_metal_init: loaded kernel_norm                           0x295f178e0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_f16_f32                0x295f17b40\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_0_f32               0x295f17da0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_1_f32               0x295f18000\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q2_K_f32               0x7962b9900\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q3_K_f32               0x7962bf5f0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_K_f32               0x7962bc630\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q5_K_f32               0x142045960\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q6_K_f32               0x7962ba2b0\n",
-      "ggml_metal_init: loaded kernel_rope                           0x7962c35f0\n",
-      "ggml_metal_init: loaded kernel_alibi_f32                      0x7962c30b0\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f16                    0x7962c15b0\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f32                    0x7962beb10\n",
-      "ggml_metal_init: loaded kernel_cpy_f16_f16                    0x7962bf060\n",
-      "ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
-      "ggml_metal_init: hasUnifiedMemory             = true\n",
-      "ggml_metal_init: maxTransferRate              = built-in GPU\n",
-      "ggml_metal_add_buffer: allocated 'data            ' buffer, size =  6984.06 MB, (35852.94 / 21845.34), warning: current allocated size is greater than the recommended max working set size\n",
-      "ggml_metal_add_buffer: allocated 'eval            ' buffer, size =  1026.00 MB, (36878.94 / 21845.34), warning: current allocated size is greater than the recommended max working set size\n",
-      "ggml_metal_add_buffer: allocated 'kv              ' buffer, size =  1602.00 MB, (38480.94 / 21845.34), warning: current allocated size is greater than the recommended max working set size\n",
-      "ggml_metal_add_buffer: allocated 'scr0            ' buffer, size =   298.00 MB, (38778.94 / 21845.34), warning: current allocated size is greater than the recommended max working set size\n",
-      "AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | \n",
-      "ggml_metal_add_buffer: allocated 'scr1            ' buffer, size =   512.00 MB, (39290.94 / 21845.34), warning: current allocated size is greater than the recommended max working set size\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "n_gpu_layers = 1  # Metal set to 1 is enough.\n",
    "n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.\n",
@ -288,7 +235,7 @@
    "\n",
    "# Make sure the model path is correct for your system!\n",
    "llm = LlamaCpp(\n",
-    "    model_path=\"/Users/rlm/Desktop/Code/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin\",\n",
+    "    model_path=\"/Users/rlm/Desktop/Code/llama.cpp/models/llama-2-13b-chat.ggufv3.q4_0.bin\",\n",
    "    n_gpu_layers=n_gpu_layers,\n",
    "    n_batch=n_batch,\n",
    "    n_ctx=2048,\n",
@ -313,8 +260,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 30,
-   "id": "e940de71",
+   "execution_count": 11,
+   "id": "bf0162e0-8c41-4344-88ae-ff2bbaeb12eb",
   "metadata": {},
   "outputs": [
    {
@ -328,13 +275,17 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
+      "by jonathan \n",
+      "\n",
+      "Here's the hypothetical rap battle:\n",
+      "\n",
+      "[Stephen Colbert]: Yo, this is Stephen Colbert, known for my comedy show. I'm here to put some sense in your mind, like an enema do-go. Your opponent? A man of laughter and witty quips, John Oliver! Now let's see who gets the most laughs while taking shots at each other\n",
      "\n",
-      "Setting: The Late Show with Stephen Colbert. The studio audience is filled with fans of both comedians, and the energy is electric. The two comedians are seated at a table, ready to begin their epic rap battle.\n",
+      "[John Oliver]: Yo, this is John Oliver, known for my own comedy show. I'm here to take your mind on an adventure through wit and humor. But first, allow me to you to our contestant: Stephen Colbert! His show has been around since the '90s, but it's time to see who can out-rap whom\n",
      "\n",
-      "Stephen Colbert: (smirking) Oh, you think you can take me down, John? You're just a Brit with a funny accent, and I'm the king of comedy!\n",
-      "John Oliver: (grinning) Oh, you think you're tough, Stephen? You're just a has-been from South Carolina, and I'm the future of comedy!\n",
-      "The battle begins, with each comedian delivering clever rhymes and witty insults. Here are a few lines that might be included:\n",
-      "Stephen Colbert: (rapping) You may have a big brain, John, but you can't touch my charm / I've got the audience in stitches, while you're just a blemish on the screen / Your accent is so thick, it's like trying to hear a speech through a mouthful of marshmallows / You may have"
+      "[Stephen Colbert]: You claim to be a witty man, John Oliver, with your British charm and clever remarks. But my knows that I'm America's funnyman! Who's the one taking you? Nobody!\n",
+      "\n",
+      "[John Oliver]: Hey Stephen Colbert, don't get too cocky. You may"
     ]
    },
    {
@ -342,29 +293,26 @@
     "output_type": "stream",
     "text": [
      "\n",
-      "llama_print_timings:        load time =  2201.54 ms\n",
-      "llama_print_timings:      sample time =   182.54 ms /   256 runs   (    0.71 ms per token,  1402.41 tokens per second)\n",
-      "llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)\n",
-      "llama_print_timings:        eval time =  8484.62 ms /   256 runs   (   33.14 ms per token,    30.17 tokens per second)\n",
-      "llama_print_timings:       total time =  9000.62 ms\n"
+      "llama_print_timings:        load time =  4481.74 ms\n",
+      "llama_print_timings:      sample time =   183.05 ms /   256 runs   (    0.72 ms per token,  1398.53 tokens per second)\n",
+      "llama_print_timings: prompt eval time =   456.05 ms /    13 tokens (   35.08 ms per token,    28.51 tokens per second)\n",
+      "llama_print_timings:        eval time =  7375.20 ms /   255 runs   (   28.92 ms per token,    34.58 tokens per second)\n",
+      "llama_print_timings:       total time =  8388.92 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
-       "\"\\nSetting: The Late Show with Stephen Colbert. The studio audience is filled with fans of both comedians, and the energy is electric. The two comedians are seated at a table, ready to begin their epic rap battle.\\n\\nStephen Colbert: (smirking) Oh, you think you can take me down, John? You're just a Brit with a funny accent, and I'm the king of comedy!\\nJohn Oliver: (grinning) Oh, you think you're tough, Stephen? You're just a has-been from South Carolina, and I'm the future of comedy!\\nThe battle begins, with each comedian delivering clever rhymes and witty insults. Here are a few lines that might be included:\\nStephen Colbert: (rapping) You may have a big brain, John, but you can't touch my charm / I've got the audience in stitches, while you're just a blemish on the screen / Your accent is so thick, it's like trying to hear a speech through a mouthful of marshmallows / You may have\""
+       "\"by jonathan \\n\\nHere's the hypothetical rap battle:\\n\\n[Stephen Colbert]: Yo, this is Stephen Colbert, known for my comedy show. I'm here to put some sense in your mind, like an enema do-go. Your opponent? A man of laughter and witty quips, John Oliver! Now let's see who gets the most laughs while taking shots at each other\\n\\n[John Oliver]: Yo, this is John Oliver, known for my own comedy show. I'm here to take your mind on an adventure through wit and humor. But first, allow me to you to our contestant: Stephen Colbert! His show has been around since the '90s, but it's time to see who can out-rap whom\\n\\n[Stephen Colbert]: You claim to be a witty man, John Oliver, with your British charm and clever remarks. But my knows that I'm America's funnyman! Who's the one taking you? Nobody!\\n\\n[John Oliver]: Hey Stephen Colbert, don't get too cocky. You may\""
      ]
     },
-     "execution_count": 30,
+     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "prompt = \"\"\"\n",
-    "Question: A rap battle between Stephen Colbert and John Oliver\n",
-    "\"\"\"\n",
-    "llm(prompt)"
+    "llm(\"Simulate a rap battle between Stephen Colbert and John Oliver\")"
   ]
  },
  {
@ -389,85 +337,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
-   "id": "4a24eef1",
+   "execution_count": null,
+   "id": "57c1aec0-04c7-479e-b9bf-af3c547ba0a3",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Found model file at  /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "objc[47842]: Class GGMLMetalClass is implemented in both /Users/rlm/anaconda3/envs/lcn2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x29f48c208) and /Users/rlm/anaconda3/envs/lcn2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libllamamodel-mainline-metal.dylib (0x29f970208). One of the two will be used. Which one is undefined.\n",
-      "llama.cpp: using Metal\n",
-      "llama.cpp: loading model from /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n",
-      "llama_model_load_internal: format     = ggjt v3 (latest)\n",
-      "llama_model_load_internal: n_vocab    = 32001\n",
-      "llama_model_load_internal: n_ctx      = 2048\n",
-      "llama_model_load_internal: n_embd     = 5120\n",
-      "llama_model_load_internal: n_mult     = 256\n",
-      "llama_model_load_internal: n_head     = 40\n",
-      "llama_model_load_internal: n_layer    = 40\n",
-      "llama_model_load_internal: n_rot      = 128\n",
-      "llama_model_load_internal: ftype      = 2 (mostly Q4_0)\n",
-      "llama_model_load_internal: n_ff       = 13824\n",
-      "llama_model_load_internal: n_parts    = 1\n",
-      "llama_model_load_internal: model size = 13B\n",
-      "llama_model_load_internal: ggml ctx size =    0.09 MB\n",
-      "llama_model_load_internal: mem required  = 9031.71 MB (+ 1608.00 MB per state)\n",
-      "llama_new_context_with_model: kv self size  = 1600.00 MB\n",
-      "ggml_metal_init: allocating\n",
-      "ggml_metal_init: using MPS\n",
-      "ggml_metal_init: loading '/Users/rlm/anaconda3/envs/lcn2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/ggml-metal.metal'\n",
-      "ggml_metal_init: loaded kernel_add                            0x115fcbfb0\n",
-      "ggml_metal_init: loaded kernel_mul                            0x115fcd4a0\n",
-      "ggml_metal_init: loaded kernel_mul_row                        0x115fce850\n",
-      "ggml_metal_init: loaded kernel_scale                          0x115fcd700\n",
-      "ggml_metal_init: loaded kernel_silu                           0x115fcd960\n",
-      "ggml_metal_init: loaded kernel_relu                           0x115fcfd50\n",
-      "ggml_metal_init: loaded kernel_gelu                           0x115fd03c0\n",
-      "ggml_metal_init: loaded kernel_soft_max                       0x115fcf640\n",
-      "ggml_metal_init: loaded kernel_diag_mask_inf                  0x115fd07f0\n",
-      "ggml_metal_init: loaded kernel_get_rows_f16                   0x1147b2450\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_0                  0x11479d1d0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_1                  0x1147ad1f0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q2_k                  0x1147aef50\n",
-      "ggml_metal_init: loaded kernel_get_rows_q3_k                  0x1147af1b0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q4_k                  0x1147af410\n",
-      "ggml_metal_init: loaded kernel_get_rows_q5_k                  0x1147affa0\n",
-      "ggml_metal_init: loaded kernel_get_rows_q6_k                  0x1147b0200\n",
-      "ggml_metal_init: loaded kernel_rms_norm                       0x1147b0460\n",
-      "ggml_metal_init: loaded kernel_norm                           0x1147bfc90\n",
-      "ggml_metal_init: loaded kernel_mul_mat_f16_f32                0x1147c0230\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_0_f32               0x1147c0490\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_1_f32               0x1147c06f0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q2_k_f32               0x1147c0950\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q3_k_f32               0x1147c0bb0\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q4_k_f32               0x1147c0e10\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q5_k_f32               0x1147c1070\n",
-      "ggml_metal_init: loaded kernel_mul_mat_q6_k_f32               0x1147c13d0\n",
-      "ggml_metal_init: loaded kernel_rope                           0x1147c1a00\n",
-      "ggml_metal_init: loaded kernel_alibi_f32                      0x1147c2120\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f16                    0x115fd1690\n",
-      "ggml_metal_init: loaded kernel_cpy_f32_f32                    0x115fd1c60\n",
-      "ggml_metal_init: loaded kernel_cpy_f16_f16                    0x115fd2d40\n",
-      "ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
-      "ggml_metal_init: hasUnifiedMemory             = true\n",
-      "ggml_metal_init: maxTransferRate              = built-in GPU\n",
-      "ggml_metal_add_buffer: allocated 'data            ' buffer, size =  6984.06 MB, ( 6984.45 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'eval            ' buffer, size =  1024.00 MB, ( 8008.45 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'kv              ' buffer, size =  1602.00 MB, ( 9610.45 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'scr0            ' buffer, size =   512.00 MB, (10122.45 / 21845.34)\n",
-      "ggml_metal_add_buffer: allocated 'scr1            ' buffer, size =   512.00 MB, (10634.45 / 21845.34)\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "from langchain.llms import GPT4All\n",
    "\n",
@ -495,9 +368,7 @@
   "cell_type": "code",
   "execution_count": 27,
   "id": "18a3716d",
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
@ -573,9 +444,32 @@
    "`chain_type=\"stuff\"` (see [here](https://python.langchain.com/docs/modules/chains/document/stuff)) means that all the docs will be added (stuffed) into a prompt."
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "3cce6977-52e7-4944-89b4-c161d04f6698",
+   "metadata": {},
+   "source": [
+    "We can also use the LangChain Prompt Hub to store and fetch prompts that are model-specific.\n",
+    "\n",
+    "This will work with your [LangSmith API key](https://docs.smith.langchain.com/).\n",
+    "\n",
+    "Let's try with a default RAG prompt, [here](https://smith.langchain.com/hub/rlm/rag-prompt-default)."
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": null,
+   "id": "4ae37573-63a7-4564-90e1-196a8ea9b526",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain import hub\n",
+    "rag_prompt = hub.pull(\"rlm/rag-prompt-default\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
   "id": "c01c1725",
   "metadata": {},
   "outputs": [
@ -590,7 +484,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      " Hi there! There are three main approaches to task decomposition. One is using LLM with simple prompting like \"Steps for XYZ.\" or \"What are the subgoals for achieving XYZ?\" Another approach is by using task-specific instructions, such as \"Write a story outline\" for writing a novel. Finally, task decomposition can also be done with human inputs. Thanks for asking!"
+      "\n",
+      "Task can be done by down a task into smaller subtasks, using simple prompting like \"Steps for XYZ.\" or task-specific like \"Write a story outline\" for writing a novel."
     ]
    },
    {
@ -598,43 +493,114 @@
     "output_type": "stream",
     "text": [
      "\n",
-      "llama_print_timings:        load time =  1191.88 ms\n",
-      "llama_print_timings:      sample time =    61.21 ms /    85 runs   (    0.72 ms per token,  1388.64 tokens per second)\n",
-      "llama_print_timings: prompt eval time =  8014.11 ms /   267 tokens (   30.02 ms per token,    33.32 tokens per second)\n",
-      "llama_print_timings:        eval time =  2908.17 ms /    84 runs   (   34.62 ms per token,    28.88 tokens per second)\n",
-      "llama_print_timings:       total time = 11096.23 ms\n"
+      "llama_print_timings:        load time = 11326.20 ms\n",
+      "llama_print_timings:      sample time =    33.03 ms /    47 runs   (    0.70 ms per token,  1422.86 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  1387.31 ms /   242 tokens (    5.73 ms per token,   174.44 tokens per second)\n",
+      "llama_print_timings:        eval time =  1321.62 ms /    46 runs   (   28.73 ms per token,    34.81 tokens per second)\n",
+      "llama_print_timings:       total time =  2801.08 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
-       "{'output_text': ' Hi there! There are three main approaches to task decomposition. One is using LLM with simple prompting like \"Steps for XYZ.\" or \"What are the subgoals for achieving XYZ?\" Another approach is by using task-specific instructions, such as \"Write a story outline\" for writing a novel. Finally, task decomposition can also be done with human inputs. Thanks for asking!'}"
+       "{'output_text': '\\nTask can be done by down a task into smaller subtasks, using simple prompting like \"Steps for XYZ.\" or task-specific like \"Write a story outline\" for writing a novel.'}"
      ]
     },
-     "execution_count": 20,
+     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from langchain.chains.question_answering import load_qa_chain\n",
-    "\n",
-    "# Prompt\n",
-    "template = \"\"\"Use the following pieces of context to answer the question at the end. \n",
-    "If you don't know the answer, just say that you don't know, don't try to make up an answer. \n",
-    "Use three sentences maximum and keep the answer as concise as possible. \n",
-    "Always say \"thanks for asking!\" at the end of the answer. \n",
-    "{context}\n",
-    "Question: {question}\n",
-    "Helpful Answer:\"\"\"\n",
-    "QA_CHAIN_PROMPT = PromptTemplate(\n",
-    "    input_variables=[\"context\", \"question\"],\n",
-    "    template=template,\n",
-    ")\n",
-    "\n",
    "# Chain\n",
-    "chain = load_qa_chain(llm, chain_type=\"stuff\", prompt=QA_CHAIN_PROMPT)\n",
-    "\n",
+    "chain = load_qa_chain(llm, chain_type=\"stuff\", prompt=rag_prompt)\n",
+    "# Run\n",
+    "chain({\"input_documents\": docs, \"question\": question}, return_only_outputs=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2e5913f0-cf92-4e21-8794-0502ba11b202",
+   "metadata": {},
+   "source": [
+    "Now, let's try with [a prompt specifically for LLaMA](https://smith.langchain.com/hub/rlm/rag-prompt-llama), which [includes special tokens](https://huggingface.co/blog/llama2#how-to-prompt-llama-2)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "78f6862d-b7a6-4e03-84e4-45667185bf9b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ChatPromptTemplate(input_variables=['question', 'context'], output_parser=None, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question', 'context'], output_parser=None, partial_variables={}, template=\"[INST]<<SYS>> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.<</SYS>> \\nQuestion: {question} \\nContext: {context} \\nAnswer: [/INST]\", template_format='f-string', validate_template=True), additional_kwargs={})])"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "rag_prompt_llama = hub.pull(\"rlm/rag-prompt-llama\")\n",
+    "rag_prompt_llama"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "67cefb46-acd3-4c2a-a8f6-b62c7c3e30dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Llama.generate: prefix-match hit\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Sure, I'd be happy to help! Based on the context, here are some to task:\n",
+      "\n",
+      "1. LLM with simple prompting: This using a large model (LLM) with simple prompts like \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\" to decompose tasks into smaller steps.\n",
+      "2. Task-specific: Another is to use task-specific, such as \"Write a story outline\" for writing a novel, to guide the of tasks.\n",
+      "3. Human inputs:, human inputs can be used to supplement the process, in cases where the task a high degree of creativity or expertise.\n",
+      "\n",
+      "As fores in long-term and task, one major is that LLMs to adjust plans when faced with errors, making them less robust to humans who learn from trial and error."
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "llama_print_timings:        load time = 11326.20 ms\n",
+      "llama_print_timings:      sample time =   144.81 ms /   207 runs   (    0.70 ms per token,  1429.47 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  1506.13 ms /   258 tokens (    5.84 ms per token,   171.30 tokens per second)\n",
+      "llama_print_timings:        eval time =  6231.92 ms /   206 runs   (   30.25 ms per token,    33.06 tokens per second)\n",
+      "llama_print_timings:       total time =  8158.41 ms\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'output_text': '  Sure, I\\'d be happy to help! Based on the context, here are some to task:\\n\\n1. LLM with simple prompting: This using a large model (LLM) with simple prompts like \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\" to decompose tasks into smaller steps.\\n2. Task-specific: Another is to use task-specific, such as \"Write a story outline\" for writing a novel, to guide the of tasks.\\n3. Human inputs:, human inputs can be used to supplement the process, in cases where the task a high degree of creativity or expertise.\\n\\nAs fores in long-term and task, one major is that LLMs to adjust plans when faced with errors, making them less robust to humans who learn from trial and error.'}"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Chain\n",
+    "chain = load_qa_chain(llm, chain_type=\"stuff\", prompt=rag_prompt_llama)\n",
    "# Run\n",
    "chain({\"input_documents\": docs, \"question\": question}, return_only_outputs=True)"
   ]
@ -655,7 +621,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 29,
   "id": "86c7a349",
   "metadata": {},
   "outputs": [],
@ -665,13 +631,13 @@
    "qa_chain = RetrievalQA.from_chain_type(\n",
    "    llm,\n",
    "    retriever=vectorstore.as_retriever(),\n",
-    "    chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT},\n",
+    "    chain_type_kwargs={\"prompt\": rag_prompt_llama},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 30,
   "id": "112ca227",
   "metadata": {},
   "outputs": [
@ -686,8 +652,15 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      " \n",
-      "The three approaches to Task decomposition are LLMs with simple prompting, task-specific instructions, or human inputs. Thanks for asking!"
+      "  Sure! Based on the context, here's my answer to your:\n",
+      "\n",
+      "There are several to task,:\n",
+      "\n",
+      "1. LLM-based with simple prompting, such as \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\"\n",
+      "2. Task-specific, like \"Write a story outline\" for writing a novel.\n",
+      "3. Human inputs to guide the process.\n",
+      "\n",
+      "These can be used to decompose complex tasks into smaller, more manageable subtasks, which can help improve the and effectiveness of task. However, long-term and task can being due to the need to plan over a lengthy history and explore the space., LLMs may to adjust plans when faced with errors, making them less robust to human learners who can learn from trial and error."
     ]
    },
    {
@ -695,21 +668,21 @@
     "output_type": "stream",
     "text": [
      "\n",
-      "llama_print_timings:        load time =  1191.88 ms\n",
-      "llama_print_timings:      sample time =    22.78 ms /    31 runs   (    0.73 ms per token,  1360.66 tokens per second)\n",
-      "llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)\n",
-      "llama_print_timings:        eval time =  1320.23 ms /    31 runs   (   42.59 ms per token,    23.48 tokens per second)\n",
-      "llama_print_timings:       total time =  1387.70 ms\n"
+      "llama_print_timings:        load time = 11326.20 ms\n",
+      "llama_print_timings:      sample time =   139.20 ms /   200 runs   (    0.70 ms per token,  1436.76 tokens per second)\n",
+      "llama_print_timings: prompt eval time =  1532.26 ms /   258 tokens (    5.94 ms per token,   168.38 tokens per second)\n",
+      "llama_print_timings:        eval time =  5977.62 ms /   199 runs   (   30.04 ms per token,    33.29 tokens per second)\n",
+      "llama_print_timings:       total time =  7916.21 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'query': 'What are the approaches to Task Decomposition?',\n",
-       " 'result': ' \\nThe three approaches to Task decomposition are LLMs with simple prompting, task-specific instructions, or human inputs. Thanks for asking!'}"
+       " 'result': '  Sure! Based on the context, here\\'s my answer to your:\\n\\nThere are several to task,:\\n\\n1. LLM-based with simple prompting, such as \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\"\\n2. Task-specific, like \"Write a story outline\" for writing a novel.\\n3. Human inputs to guide the process.\\n\\nThese can be used to decompose complex tasks into smaller, more manageable subtasks, which can help improve the and effectiveness of task. However, long-term and task can being due to the need to plan over a lengthy history and explore the space., LLMs may to adjust plans when faced with errors, making them less robust to human learners who can learn from trial and error.'}"
      ]
     },
-     "execution_count": 22,
+     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
--- a/docs/extras/use_cases/sql.ipynb
+++ b/docs/extras/use_cases/sql.ipynb
@ -149,9 +149,7 @@
  {
   "cell_type": "code",
   "execution_count": 8,
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.chat_models import ChatOpenAI\n",
@ -251,7 +249,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@ -280,6 +278,25 @@
    ")"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can also access this [prompt](https://smith.langchain.com/hub/rlm/text-to-sql) in the LangChain prompt hub.\n",
+    "\n",
+    "This will work with your [LangSmith API key](https://docs.smith.langchain.com/)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain import hub\n",
+    "CUSTOM_PROMPT = hub.pull(\"rlm/text-to-sql\")"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@ -854,5 +871,5 @@
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
--- a/docs/extras/use_cases/summarization.ipynb
+++ b/docs/extras/use_cases/summarization.ipynb
@ -220,6 +220,30 @@
    "map_chain = LLMChain(llm=llm, prompt=map_prompt)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "272ce8ce-919d-4ded-bbd5-a53a8a30bc66",
+   "metadata": {},
+   "source": [
+    "We can also use the Prompt Hub to store and fetch prompts.\n",
+    "\n",
+    "This will work with your [LangSmith API key](https://docs.smith.langchain.com/).\n",
+    "\n",
+    "For example, see the map prompt [here](https://smith.langchain.com/hub/rlm/map-prompt)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ce48b805-d98b-4e0f-8b9e-3b3e72cad3d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain import hub\n",
+    "map_prompt = hub.pull(\"rlm/map-prompt\")\n",
+    "map_chain = LLMChain(llm=llm, prompt=map_prompt)"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "bee3c331",
@ -232,8 +256,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
-   "id": "1edb1b0d",
+   "execution_count": null,
+   "id": "6a718890-99ab-439a-8f79-b9ae9c58ad24",
   "metadata": {},
   "outputs": [],
   "source": [
@ -242,7 +266,28 @@
    "{doc_summaries}\n",
    "Take these and distill it into a final, consolidated summary of the main themes. \n",
    "Helpful Answer:\"\"\"\n",
-    "reduce_prompt = PromptTemplate.from_template(reduce_template)\n",
+    "reduce_prompt = PromptTemplate.from_template(reduce_template)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f189184a-673e-4530-8a6b-57b091045d87",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Note we can also pull the reduce prompt (not the map prompt) from the prompt hub, as noted above\n",
+    "reduce_prompt = hub.pull(\"rlm/reduce-prompt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "1edb1b0d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build the reduce chain\n",
    "reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)\n",
    "\n",
    "# Takes a list of documents, combines them into a single string, and passes this to an LLMChain\n",
@ -503,7 +548,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.16"
  }
 },
 "nbformat": 4,