Add Exact match and Regex Match Evaluators (#11132)

William FH committed by GitHub
commit 33da8bd711 (parent e355606b11)

docs/extras/guides/evaluation/comparison/custom.ipynb
@@ -1,280 +1,281 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "657d2c8c-54b4-42a3-9f02-bdefa0ed6728",
"metadata": {},
"source": [
"# Custom Pairwise Evaluator\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/comparison/custom.ipynb)\n",
"\n",
"You can make your own pairwise string evaluators by inheriting from `PairwiseStringEvaluator` class and overwriting the `_evaluate_string_pairs` method (and the `_aevaluate_string_pairs` method if you want to use the evaluator asynchronously).\n",
"\n",
"In this example, you will make a simple custom evaluator that just returns whether the first prediction has more whitespace tokenized 'words' than the second.\n",
"\n",
"You can check out the reference docs for the [PairwiseStringEvaluator interface](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.PairwiseStringEvaluator.html#langchain.evaluation.schema.PairwiseStringEvaluator) for more info.\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "93f3a653-d198-4291-973c-8d1adba338b2",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from typing import Optional, Any\n",
"from langchain.evaluation import PairwiseStringEvaluator\n",
"\n",
"\n",
"class LengthComparisonPairwiseEvalutor(PairwiseStringEvaluator):\n",
" \"\"\"\n",
" Custom evaluator to compare two strings.\n",
" \"\"\"\n",
"\n",
" def _evaluate_string_pairs(\n",
" self,\n",
" *,\n",
" prediction: str,\n",
" prediction_b: str,\n",
" reference: Optional[str] = None,\n",
" input: Optional[str] = None,\n",
" **kwargs: Any,\n",
" ) -> dict:\n",
" score = int(len(prediction.split()) > len(prediction_b.split()))\n",
" return {\"score\": score}"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7d4a77c3-07a7-4076-8e7f-f9bca0d6c290",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator = LengthComparisonPairwiseEvalutor()\n",
"\n",
"evaluator.evaluate_string_pairs(\n",
" prediction=\"The quick brown fox jumped over the lazy dog.\",\n",
" prediction_b=\"The quick brown fox jumped over the dog.\",\n",
")"
]
},
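{
"cell_type": "markdown",
"id": "async-variant-note",
"metadata": {},
"source": [
"As mentioned in the intro, you can also override `_aevaluate_string_pairs` to support async use. The cell below is a minimal, hedged sketch (not part of the original example) of how that might look for the same length comparison; the class name is just an illustrative choice, and the async method simply reuses the synchronous logic."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "async-variant-sketch",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Sketch: the same word-count comparison, with an async override added.\n",
"from typing import Any, Optional\n",
"\n",
"from langchain.evaluation import PairwiseStringEvaluator\n",
"\n",
"\n",
"class AsyncLengthComparisonPairwiseEvaluator(PairwiseStringEvaluator):\n",
"    \"\"\"Compare two strings by word count, with sync and async implementations.\"\"\"\n",
"\n",
"    def _evaluate_string_pairs(\n",
"        self,\n",
"        *,\n",
"        prediction: str,\n",
"        prediction_b: str,\n",
"        reference: Optional[str] = None,\n",
"        input: Optional[str] = None,\n",
"        **kwargs: Any,\n",
"    ) -> dict:\n",
"        score = int(len(prediction.split()) > len(prediction_b.split()))\n",
"        return {\"score\": score}\n",
"\n",
"    async def _aevaluate_string_pairs(\n",
"        self,\n",
"        *,\n",
"        prediction: str,\n",
"        prediction_b: str,\n",
"        reference: Optional[str] = None,\n",
"        input: Optional[str] = None,\n",
"        **kwargs: Any,\n",
"    ) -> dict:\n",
"        # The comparison is cheap, so the async version just delegates to the sync one.\n",
"        return self._evaluate_string_pairs(\n",
"            prediction=prediction, prediction_b=prediction_b, **kwargs\n",
"        )\n",
"\n",
"\n",
"# Notebooks support top-level await, so the public async method can be called directly:\n",
"await AsyncLengthComparisonPairwiseEvaluator().aevaluate_string_pairs(\n",
"    prediction=\"The quick brown fox jumped over the lazy dog.\",\n",
"    prediction_b=\"The quick brown fox jumped over the dog.\",\n",
")"
]
},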
{
"cell_type": "markdown",
"id": "d90f128f-6f49-42a1-b05a-3aea568ee03b",
"metadata": {},
"source": [
"## LLM-Based Example\n",
"\n",
"That example was simple to illustrate the API, but it wasn't very useful in practice. Below, use an LLM with some custom instructions to form a simple preference scorer similar to the built-in [PairwiseStringEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain.html#langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain). We will use `ChatAnthropic` for the evaluator chain."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b4b43098-4d96-417b-a8a9-b3e75779cfe8",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# %pip install anthropic\n",
"# %env ANTHROPIC_API_KEY=YOUR_API_KEY"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b6e978ab-48f1-47ff-9506-e13b1a50be6e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from typing import Optional, Any\n",
"from langchain.evaluation import PairwiseStringEvaluator\n",
"from langchain.chat_models import ChatAnthropic\n",
"from langchain.chains import LLMChain\n",
"\n",
"\n",
"class CustomPreferenceEvaluator(PairwiseStringEvaluator):\n",
" \"\"\"\n",
" Custom evaluator to compare two strings using a custom LLMChain.\n",
" \"\"\"\n",
"\n",
" def __init__(self) -> None:\n",
" llm = ChatAnthropic(model=\"claude-2\", temperature=0)\n",
" self.eval_chain = LLMChain.from_string(\n",
" llm,\n",
" \"\"\"Which option is preferred? Do not take order into account. Evaluate based on accuracy and helpfulness. If neither is preferred, respond with C. Provide your reasoning, then finish with Preference: A/B/C\n",
"\n",
"Input: How do I get the path of the parent directory in python 3.8?\n",
"Option A: You can use the following code:\n",
"```python\n",
"import os\n",
"\n",
"os.path.dirname(os.path.dirname(os.path.abspath(__file__)))\n",
"```\n",
"Option B: You can use the following code:\n",
"```python\n",
"from pathlib import Path\n",
"Path(__file__).absolute().parent\n",
"```\n",
"Reasoning: Both options return the same result. However, since option B is more concise and easily understand, it is preferred.\n",
"Preference: B\n",
"\n",
"Which option is preferred? Do not take order into account. Evaluate based on accuracy and helpfulness. If neither is preferred, respond with C. Provide your reasoning, then finish with Preference: A/B/C\n",
"Input: {input}\n",
"Option A: {prediction}\n",
"Option B: {prediction_b}\n",
"Reasoning:\"\"\",\n",
" )\n",
"\n",
" @property\n",
" def requires_input(self) -> bool:\n",
" return True\n",
"\n",
" @property\n",
" def requires_reference(self) -> bool:\n",
" return False\n",
"\n",
" def _evaluate_string_pairs(\n",
" self,\n",
" *,\n",
" prediction: str,\n",
" prediction_b: str,\n",
" reference: Optional[str] = None,\n",
" input: Optional[str] = None,\n",
" **kwargs: Any,\n",
" ) -> dict:\n",
" result = self.eval_chain(\n",
" {\n",
" \"input\": input,\n",
" \"prediction\": prediction,\n",
" \"prediction_b\": prediction_b,\n",
" \"stop\": [\"Which option is preferred?\"],\n",
" },\n",
" **kwargs,\n",
" )\n",
"\n",
" response_text = result[\"text\"]\n",
" reasoning, preference = response_text.split(\"Preference:\", maxsplit=1)\n",
" preference = preference.strip()\n",
" score = 1.0 if preference == \"A\" else (0.0 if preference == \"B\" else None)\n",
" return {\"reasoning\": reasoning.strip(), \"value\": preference, \"score\": score}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "5cbd8b1d-2cb0-4f05-b435-a1a00074d94a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"evaluator = CustomPreferenceEvaluator()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "2c0a7fb7-b976-4443-9f0e-e707a6dfbdf7",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'reasoning': 'Option B is preferred over option A for importing from a relative directory, because it is more straightforward and concise.\\n\\nOption A uses the importlib module, which allows importing a module by specifying the full name as a string. While this works, it is less clear compared to option B.\\n\\nOption B directly imports from the relative path using dot notation, which clearly shows that it is a relative import. This is the recommended way to do relative imports in Python.\\n\\nIn summary, option B is more accurate and helpful as it uses the standard Python relative import syntax.',\n",
" 'value': 'B',\n",
" 'score': 0.0}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_string_pairs(\n",
" input=\"How do I import from a relative directory?\",\n",
" prediction=\"use importlib! importlib.import_module('.my_package', '.')\",\n",
" prediction_b=\"from .sibling import foo\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f13a1346-7dbe-451d-b3a3-99e8fc7b753b",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CustomPreferenceEvaluator requires an input string.\n"
]
}
],
"source": [
"# Setting requires_input to return True adds additional validation to avoid returning a grade when insufficient data is provided to the chain.\n",
"\n",
"try:\n",
" evaluator.evaluate_string_pairs(\n",
" prediction=\"use importlib! importlib.import_module('.my_package', '.')\",\n",
" prediction_b=\"from .sibling import foo\",\n",
" )\n",
"except ValueError as e:\n",
" print(e)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7829cc3-ebd1-4628-ae97-15166202e9cc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

docs/extras/guides/evaluation/comparison/pairwise_embedding_distance.ipynb
@@ -1,232 +1,233 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Pairwise Embedding Distance \n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/comparison/pairwise_embedding_distance.ipynb)\n",
"\n",
"One way to measure the similarity (or dissimilarity) between two predictions on a shared or similar input is to embed the predictions and compute a vector distance between the two embeddings.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
"\n",
"You can load the `pairwise_embedding_distance` evaluator to do this.\n",
"\n",
"**Note:** This returns a **distance** score, meaning that the lower the number, the **more** similar the outputs are, according to their embedded representation.\n",
"\n",
"Check out the reference docs for the [PairwiseEmbeddingDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain.html#langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain) for more info."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"pairwise_embedding_distance\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.0966466944859925}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_string_pairs(\n",
" prediction=\"Seattle is hot in June\", prediction_b=\"Seattle is cool in June.\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.03761174337464557}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_string_pairs(\n",
" prediction=\"Seattle is warm in June\", prediction_b=\"Seattle is cool in June.\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select the Distance Metric\n",
"\n",
"By default, the evalutor uses cosine distance. You can choose a different distance metric if you'd like. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[<EmbeddingDistance.COSINE: 'cosine'>,\n",
" <EmbeddingDistance.EUCLIDEAN: 'euclidean'>,\n",
" <EmbeddingDistance.MANHATTAN: 'manhattan'>,\n",
" <EmbeddingDistance.CHEBYSHEV: 'chebyshev'>,\n",
" <EmbeddingDistance.HAMMING: 'hamming'>]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.evaluation import EmbeddingDistance\n",
"\n",
"list(EmbeddingDistance)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"evaluator = load_evaluator(\n",
" \"pairwise_embedding_distance\", distance_metric=EmbeddingDistance.EUCLIDEAN\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select Embeddings to Use\n",
"\n",
"The constructor uses `OpenAI` embeddings by default, but you can configure this however you want. Below, use huggingface local embeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"\n",
"embedding_model = HuggingFaceEmbeddings()\n",
"hf_evaluator = load_evaluator(\"pairwise_embedding_distance\", embeddings=embedding_model)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.5486443280477362}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hf_evaluator.evaluate_string_pairs(\n",
" prediction=\"Seattle is hot in June\", prediction_b=\"Seattle is cool in June.\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.21018880025138598}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hf_evaluator.evaluate_string_pairs(\n",
" prediction=\"Seattle is warm in June\", prediction_b=\"Seattle is cool in June.\"\n",
")"
]
},
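{
"cell_type": "markdown",
"metadata": {},
"source": [
"For a rough point of comparison with the footnote below, the older string-distance metrics can be loaded through the `pairwise_string_distance` evaluator. The cell below is a hedged sketch rather than part of the original walkthrough; it assumes the `rapidfuzz` package is installed, and like the embedding evaluator it returns a distance, so lower means more similar."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Sketch: load the string-distance pairwise evaluator (requires `rapidfuzz`)\n",
"# and run it on the same pair of predictions for comparison.\n",
"from langchain.evaluation import load_evaluator\n",
"\n",
"string_distance_evaluator = load_evaluator(\"pairwise_string_distance\")\n",
"\n",
"string_distance_evaluator.evaluate_string_pairs(\n",
"    prediction=\"Seattle is hot in June\", prediction_b=\"Seattle is cool in June.\"\n",
")"
]
},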
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a name=\"cite_note-1\"></a><i>1. Note: When it comes to semantic similarity, this often gives better results than older string distance metrics (such as those in the `PairwiseStringDistanceEvalChain`), though it tends to be less reliable than evaluators that use the LLM directly (such as the `PairwiseStringEvalChain`) </i>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

docs/extras/guides/evaluation/comparison/pairwise_string.ipynb
@@ -1,381 +1,382 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2da95378",
"metadata": {},
"source": [
"# Pairwise String Comparison\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/comparison/pairwise_string.ipynb)\n",
"\n",
"Often you will want to compare predictions of an LLM, Chain, or Agent for a given input. The `StringComparison` evaluators facilitate this so you can answer questions like:\n",
"\n",
"- Which LLM or prompt produces a preferred output for a given question?\n",
"- Which examples should I include for few-shot example selection?\n",
"- Which output is better to include for fintetuning?\n",
"\n",
"The simplest and often most reliable automated way to choose a preferred prediction for a given input is to use the `pairwise_string` evaluator.\n",
"\n",
"Check out the reference docs for the [PairwiseStringEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain.html#langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain) for more info."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f6790c46",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"labeled_pairwise_string\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "49ad9139",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'reasoning': 'Both responses are relevant to the question asked, as they both provide a numerical answer to the question about the number of dogs in the park. However, Response A is incorrect according to the reference answer, which states that there are four dogs. Response B, on the other hand, is correct as it matches the reference answer. Neither response demonstrates depth of thought, as they both simply provide a numerical answer without any additional information or context. \\n\\nBased on these criteria, Response B is the better response.\\n',\n",
" 'value': 'B',\n",
" 'score': 0}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_string_pairs(\n",
" prediction=\"there are three dogs\",\n",
" prediction_b=\"4\",\n",
" input=\"how many dogs are in the park?\",\n",
" reference=\"four\",\n",
")"
]
},
{
"cell_type": "markdown",
"id": "7491d2e6-4e77-4b17-be6b-7da966785c1d",
"metadata": {},
"source": [
"## Methods\n",
"\n",
"\n",
"The pairwise string evaluator can be called using [evaluate_string_pairs](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain.html#langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain.evaluate_string_pairs) (or async [aevaluate_string_pairs](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain.html#langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain.aevaluate_string_pairs)) methods, which accept:\n",
"\n",
"- prediction (str) The predicted response of the first model, chain, or prompt.\n",
"- prediction_b (str) The predicted response of the second model, chain, or prompt.\n",
"- input (str) The input question, prompt, or other text.\n",
"- reference (str) (Only for the labeled_pairwise_string variant) The reference response.\n",
"\n",
"They return a dictionary with the following values:\n",
"- value: 'A' or 'B', indicating whether `prediction` or `prediction_b` is preferred, respectively\n",
"- score: Integer 0 or 1 mapped from the 'value', where a score of 1 would mean that the first `prediction` is preferred, and a score of 0 would mean `prediction_b` is preferred.\n",
"- reasoning: String \"chain of thought reasoning\" from the LLM generated prior to creating the score"
]
},
{
"cell_type": "markdown",
"id": "ed353b93-be71-4479-b9c0-8c97814c2e58",
"metadata": {},
"source": [
"## Without References\n",
"\n",
"When references aren't available, you can still predict the preferred response.\n",
"The results will reflect the evaluation model's preference, which is less reliable and may result\n",
"in preferences that are factually incorrect."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "586320da",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"pairwise_string\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7f56c76e-a39b-4509-8b8a-8a2afe6c3da1",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'reasoning': 'Both responses are correct and relevant to the question. However, Response B is more helpful and insightful as it provides a more detailed explanation of what addition is. Response A is correct but lacks depth as it does not explain what the operation of addition entails. \\n\\nFinal Decision: [[B]]',\n",
" 'value': 'B',\n",
" 'score': 0}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_string_pairs(\n",
" prediction=\"Addition is a mathematical operation.\",\n",
" prediction_b=\"Addition is a mathematical operation that adds two numbers to create a third number, the 'sum'.\",\n",
" input=\"What is addition?\",\n",
")"
]
},
{
"cell_type": "markdown",
"id": "4a09b21d-9851-47e8-93d3-90044b2945b0",
"metadata": {
"tags": []
},
"source": [
"## Defining the Criteria\n",
"\n",
"By default, the LLM is instructed to select the 'preferred' response based on helpfulness, relevance, correctness, and depth of thought. You can customize the criteria by passing in a `criteria` argument, where the criteria could take any of the following forms:\n",
"- [`Criteria`](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.Criteria.html#langchain.evaluation.criteria.eval_chain.Criteria) enum or its string value - to use one of the default criteria and their descriptions\n",
"- [Constitutional principal](https://api.python.langchain.com/en/latest/chains/langchain.chains.constitutional_ai.models.ConstitutionalPrinciple.html#langchain.chains.constitutional_ai.models.ConstitutionalPrinciple) - use one any of the constitutional principles defined in langchain\n",
"- Dictionary: a list of custom criteria, where the key is the name of the criteria, and the value is the description.\n",
"- A list of criteria or constitutional principles - to combine multiple criteria in one.\n",
"\n",
"Below is an example for determining preferred writing responses based on a custom style."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8539e7d9-f7b0-4d32-9c45-593a7915c093",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"custom_criteria = {\n",
" \"simplicity\": \"Is the language straightforward and unpretentious?\",\n",
" \"clarity\": \"Are the sentences clear and easy to understand?\",\n",
" \"precision\": \"Is the writing precise, with no unnecessary words or details?\",\n",
" \"truthfulness\": \"Does the writing feel honest and sincere?\",\n",
" \"subtext\": \"Does the writing suggest deeper meanings or themes?\",\n",
"}\n",
"evaluator = load_evaluator(\"pairwise_string\", criteria=custom_criteria)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "fec7bde8-fbdc-4730-8366-9d90d033c181",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'reasoning': 'Response A is simple, clear, and precise. It uses straightforward language to convey a deep and sincere message about families. The metaphor of joy and sorrow as music is effective and easy to understand.\\n\\nResponse B, on the other hand, is more complex and less clear. The language is more pretentious, with words like \"domicile,\" \"resounds,\" \"abode,\" \"dissonant,\" and \"elegy.\" While it conveys a similar message to Response A, it does so in a more convoluted way. The precision is also lacking due to the use of unnecessary words and details.\\n\\nBoth responses suggest deeper meanings or themes about the shared joy and unique sorrow in families. However, Response A does so in a more effective and accessible way.\\n\\nTherefore, the better response is [[A]].',\n",
" 'value': 'A',\n",
" 'score': 1}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_string_pairs(\n",
" prediction=\"Every cheerful household shares a similar rhythm of joy; but sorrow, in each household, plays a unique, haunting melody.\",\n",
" prediction_b=\"Where one finds a symphony of joy, every domicile of happiness resounds in harmonious,\"\n",
" \" identical notes; yet, every abode of despair conducts a dissonant orchestra, each\"\n",
" \" playing an elegy of grief that is peculiar and profound to its own existence.\",\n",
" input=\"Write some prose about families.\",\n",
")"
]
},
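{
"cell_type": "markdown",
"id": "criteria-enum-note",
"metadata": {},
"source": [
"As noted in the list above, the `criteria` argument also accepts a built-in `Criteria` value or a constitutional principle. The cell below is a hedged sketch (not part of the original example) showing both forms; it assumes `Criteria` is importable from `langchain.evaluation` and that the bundled `PRINCIPLES` dictionary includes a `harmful1` entry."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "criteria-enum-sketch",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.chains.constitutional_ai.principles import PRINCIPLES\n",
"from langchain.evaluation import Criteria, load_evaluator\n",
"\n",
"# A single built-in criterion, referenced by enum (its string value works too).\n",
"concise_evaluator = load_evaluator(\"pairwise_string\", criteria=Criteria.CONCISENESS)\n",
"\n",
"# One of the constitutional principles bundled with langchain.\n",
"principle_evaluator = load_evaluator(\n",
"    \"pairwise_string\", criteria=PRINCIPLES[\"harmful1\"]\n",
")"
]
},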
{
"cell_type": "markdown",
"id": "a25b60b2-627c-408a-be4b-a2e5cbc10726",
"metadata": {},
"source": [
"## Customize the LLM\n",
"\n",
"By default, the loader uses `gpt-4` in the evaluation chain. You can customize this when loading."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "de84a958-1330-482b-b950-68bcf23f9e35",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chat_models import ChatAnthropic\n",
"\n",
"llm = ChatAnthropic(temperature=0)\n",
"\n",
"evaluator = load_evaluator(\"labeled_pairwise_string\", llm=llm)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e162153f-d50a-4a7c-a033-019dabbc954c",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'reasoning': 'Here is my assessment:\\n\\nResponse B is more helpful, insightful, and accurate than Response A. Response B simply states \"4\", which directly answers the question by providing the exact number of dogs mentioned in the reference answer. In contrast, Response A states \"there are three dogs\", which is incorrect according to the reference answer. \\n\\nIn terms of helpfulness, Response B gives the precise number while Response A provides an inaccurate guess. For relevance, both refer to dogs in the park from the question. However, Response B is more correct and factual based on the reference answer. Response A shows some attempt at reasoning but is ultimately incorrect. Response B requires less depth of thought to simply state the factual number.\\n\\nIn summary, Response B is superior in terms of helpfulness, relevance, correctness, and depth. My final decision is: [[B]]\\n',\n",
" 'value': 'B',\n",
" 'score': 0}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_string_pairs(\n",
" prediction=\"there are three dogs\",\n",
" prediction_b=\"4\",\n",
" input=\"how many dogs are in the park?\",\n",
" reference=\"four\",\n",
")"
]
},
{
"cell_type": "markdown",
"id": "e0e89c13-d0ad-4f87-8fcb-814399bafa2a",
"metadata": {},
"source": [
"## Customize the Evaluation Prompt\n",
"\n",
"You can use your own custom evaluation prompt to add more task-specific instructions or to instruct the evaluator to score the output.\n",
"\n",
"*Note: If you use a prompt that expects generates a result in a unique format, you may also have to pass in a custom output parser (`output_parser=your_parser()`) instead of the default `PairwiseStringResultOutputParser`"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "fb817efa-3a4d-439d-af8c-773b89d97ec9",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.prompts import PromptTemplate\n",
"\n",
"prompt_template = PromptTemplate.from_template(\n",
" \"\"\"Given the input context, which do you prefer: A or B?\n",
"Evaluate based on the following criteria:\n",
"{criteria}\n",
"Reason step by step and finally, respond with either [[A]] or [[B]] on its own line.\n",
"\n",
"DATA\n",
"----\n",
"input: {input}\n",
"reference: {reference}\n",
"A: {prediction}\n",
"B: {prediction_b}\n",
"---\n",
"Reasoning:\n",
"\n",
"\"\"\"\n",
")\n",
"evaluator = load_evaluator(\n",
" \"labeled_pairwise_string\", prompt=prompt_template\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "d40aa4f0-cfd5-4cb4-83c8-8d2300a04c2f",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input_variables=['prediction', 'reference', 'prediction_b', 'input'] output_parser=None partial_variables={'criteria': 'helpfulness: Is the submission helpful, insightful, and appropriate?\\nrelevance: Is the submission referring to a real quote from the text?\\ncorrectness: Is the submission correct, accurate, and factual?\\ndepth: Does the submission demonstrate depth of thought?'} template='Given the input context, which do you prefer: A or B?\\nEvaluate based on the following criteria:\\n{criteria}\\nReason step by step and finally, respond with either [[A]] or [[B]] on its own line.\\n\\nDATA\\n----\\ninput: {input}\\nreference: {reference}\\nA: {prediction}\\nB: {prediction_b}\\n---\\nReasoning:\\n\\n' template_format='f-string' validate_template=True\n"
]
}
],
"source": [
"# The prompt was assigned to the evaluator\n",
"print(evaluator.prompt)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9467bb42-7a31-4071-8f66-9ed2c6f06dcd",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'reasoning': 'Helpfulness: Both A and B are helpful as they provide a direct answer to the question.\\nRelevance: A is relevant as it refers to the correct name of the dog from the text. B is not relevant as it provides a different name.\\nCorrectness: A is correct as it accurately states the name of the dog. B is incorrect as it provides a different name.\\nDepth: Both A and B demonstrate a similar level of depth as they both provide a straightforward answer to the question.\\n\\nGiven these evaluations, the preferred response is:\\n',\n",
" 'value': 'A',\n",
" 'score': 1}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_string_pairs(\n",
" prediction=\"The dog that ate the ice cream was named fido.\",\n",
" prediction_b=\"The dog's name is spot\",\n",
" input=\"What is the name of the dog that ate the ice cream?\",\n",
" reference=\"The dog's name is fido\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

docs/extras/guides/evaluation/examples/comparisons.ipynb
@@ -1,447 +1,448 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Comparing Chain Outputs\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/examples/comparisons.ipynb)\n",
"\n",
"Suppose you have two different prompts (or LLMs). How do you know which will generate \"better\" results?\n",
"\n",
"One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
"\n",
"For this evaluation, we will need 3 things:\n",
"1. An evaluator\n",
"2. A dataset of inputs\n",
"3. 2 (or more) LLMs, Chains, or Agents to compare\n",
"\n",
"Then we will aggregate the restults to determine the preferred model.\n",
"\n",
"### Step 1. Create the Evaluator\n",
"\n",
"In this example, you will use gpt-4 to select which output is preferred."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"eval_chain = load_evaluator(\"pairwise_string\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 2. Select Dataset\n",
"\n",
"If you already have real usage data for your LLM, you can use a representative sample. More examples\n",
"provide more reliable results. We will use some example queries someone might have about how to use langchain here."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Found cached dataset parquet (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___parquet/LangChainDatasets--langchain-howto-queries-bbb748bbee7e77aa/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a2358d37246640ce95e0f9940194590a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from langchain.evaluation.loading import load_dataset\n",
"\n",
"dataset = load_dataset(\"langchain-howto-queries\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 3. Define Models to Compare\n",
"\n",
"We will be comparing two agents in this case."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.utilities import SerpAPIWrapper\n",
"from langchain.agents import initialize_agent, Tool\n",
"from langchain.agents import AgentType\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"\n",
"# Initialize the language model\n",
"# You can add your own OpenAI API key by adding openai_api_key=\"<your_api_key>\"\n",
"llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n",
"\n",
"# Initialize the SerpAPIWrapper for search functionality\n",
"# Replace <your_api_key> in openai_api_key=\"<your_api_key>\" with your actual SerpAPI key.\n",
"search = SerpAPIWrapper()\n",
"\n",
"# Define a list of tools offered by the agent\n",
"tools = [\n",
" Tool(\n",
" name=\"Search\",\n",
" func=search.run,\n",
" coroutine=search.arun,\n",
" description=\"Useful when you need to answer questions about current events. You should ask targeted questions.\",\n",
" ),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"functions_agent = initialize_agent(\n",
" tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False\n",
")\n",
"conversations_agent = initialize_agent(\n",
" tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 4. Generate Responses\n",
"\n",
"We will generate outputs for each of the models before evaluating them."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "87277cb39a1a4726bb7cc533a24e2ea4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/20 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from tqdm.notebook import tqdm\n",
"import asyncio\n",
"\n",
"results = []\n",
"agents = [functions_agent, conversations_agent]\n",
"concurrency_level = 6 # How many concurrent agents to run. May need to decrease if OpenAI is rate limiting.\n",
"\n",
"# We will only run the first 20 examples of this dataset to speed things up\n",
"# This will lead to larger confidence intervals downstream.\n",
"batch = []\n",
"for example in tqdm(dataset[:20]):\n",
" batch.extend([agent.acall(example[\"inputs\"]) for agent in agents])\n",
" if len(batch) >= concurrency_level:\n",
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
" results.extend(list(zip(*[iter(batch_results)] * 2)))\n",
" batch = []\n",
"if batch:\n",
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
" results.extend(list(zip(*[iter(batch_results)] * 2)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 5. Evaluate Pairs\n",
"\n",
"Now it's time to evaluate the results. For each agent response, run the evaluation chain to select which output is preferred (or return a tie).\n",
"\n",
"Randomly select the input order to reduce the likelihood that one model will be preferred just because it is presented first."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import random\n",
"\n",
"\n",
"def predict_preferences(dataset, results) -> list:\n",
" preferences = []\n",
"\n",
" for example, (res_a, res_b) in zip(dataset, results):\n",
" input_ = example[\"inputs\"]\n",
" # Flip a coin to reduce persistent position bias\n",
" if random.random() < 0.5:\n",
" pred_a, pred_b = res_a, res_b\n",
" a, b = \"a\", \"b\"\n",
" else:\n",
" pred_a, pred_b = res_b, res_a\n",
" a, b = \"b\", \"a\"\n",
" eval_res = eval_chain.evaluate_string_pairs(\n",
" prediction=pred_a[\"output\"] if isinstance(pred_a, dict) else str(pred_a),\n",
" prediction_b=pred_b[\"output\"] if isinstance(pred_b, dict) else str(pred_b),\n",
" input=input_,\n",
" )\n",
" if eval_res[\"value\"] == \"A\":\n",
" preferences.append(a)\n",
" elif eval_res[\"value\"] == \"B\":\n",
" preferences.append(b)\n",
" else:\n",
" preferences.append(None) # No preference\n",
" return preferences"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"preferences = predict_preferences(dataset, results)"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"**Print out the ratio of preferences.**"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OpenAI Functions Agent: 95.00%\n",
"None: 5.00%\n"
]
}
],
"source": [
"from collections import Counter\n",
"\n",
"name_map = {\n",
" \"a\": \"OpenAI Functions Agent\",\n",
" \"b\": \"Structured Chat Agent\",\n",
"}\n",
"counts = Counter(preferences)\n",
"pref_ratios = {k: v / len(preferences) for k, v in counts.items()}\n",
"for k, v in pref_ratios.items():\n",
" print(f\"{name_map.get(k)}: {v:.2%}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Estimate Confidence Intervals\n",
"\n",
"The results seem pretty clear, but if you want to have a better sense of how confident we are, that model \"A\" (the OpenAI Functions Agent) is the preferred model, we can calculate confidence intervals. \n",
"\n",
"Below, use the Wilson score to estimate the confidence interval."
]
},
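{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the `wilson_score_interval` helper below computes the standard Wilson score interval\n",
"\n",
"$$\\frac{\\hat{p} + \\frac{z^2}{2n}}{1 + \\frac{z^2}{n}} \\;\\pm\\; \\frac{z}{1 + \\frac{z^2}{n}}\\sqrt{\\frac{\\hat{p}(1-\\hat{p})}{n} + \\frac{z^2}{4n^2}}\n",
"$$\n",
"\n",
"where $\\hat{p}$ is the observed preference rate, $n$ counts only the non-tied preferences, and $z \\approx 1.96$ for a 95% interval."
]
},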
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from math import sqrt\n",
"\n",
"\n",
"def wilson_score_interval(\n",
" preferences: list, which: str = \"a\", z: float = 1.96\n",
") -> tuple:\n",
" \"\"\"Estimate the confidence interval using the Wilson score.\n",
"\n",
" See: https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval\n",
" for more details, including when to use it and when it should not be used.\n",
" \"\"\"\n",
" total_preferences = preferences.count(\"a\") + preferences.count(\"b\")\n",
" n_s = preferences.count(which)\n",
"\n",
" if total_preferences == 0:\n",
" return (0, 0)\n",
"\n",
" p_hat = n_s / total_preferences\n",
"\n",
" denominator = 1 + (z**2) / total_preferences\n",
" adjustment = (z / denominator) * sqrt(\n",
" p_hat * (1 - p_hat) / total_preferences\n",
" + (z**2) / (4 * total_preferences * total_preferences)\n",
" )\n",
" center = (p_hat + (z**2) / (2 * total_preferences)) / denominator\n",
" lower_bound = min(max(center - adjustment, 0.0), 1.0)\n",
" upper_bound = min(max(center + adjustment, 0.0), 1.0)\n",
"\n",
" return (lower_bound, upper_bound)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The \"OpenAI Functions Agent\" would be preferred between 83.18% and 100.00% percent of the time (with 95% confidence).\n",
"The \"Structured Chat Agent\" would be preferred between 0.00% and 16.82% percent of the time (with 95% confidence).\n"
]
}
],
"source": [
"for which_, name in name_map.items():\n",
" low, high = wilson_score_interval(preferences, which=which_)\n",
" print(\n",
" f'The \"{name}\" would be preferred between {low:.2%} and {high:.2%} percent of the time (with 95% confidence).'\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Print out the p-value.**"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The p-value is 0.00000. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
"then there is a 0.00038% chance of observing the OpenAI Functions Agent be preferred at least 19\n",
"times out of 19 trials.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/gf/6rnp_mbx5914kx7qmmh7xzmw0000gn/T/ipykernel_15978/384907688.py:6: DeprecationWarning: 'binom_test' is deprecated in favour of 'binomtest' from version 1.7.0 and will be removed in Scipy 1.12.0.\n",
" p_value = stats.binom_test(successes, n, p=0.5, alternative=\"two-sided\")\n"
]
}
],
"source": [
"from scipy import stats\n",
"\n",
"preferred_model = max(pref_ratios, key=pref_ratios.get)\n",
"successes = preferences.count(preferred_model)\n",
"n = len(preferences) - preferences.count(None)\n",
"p_value = stats.binom_test(successes, n, p=0.5, alternative=\"two-sided\")\n",
"print(\n",
" f\"\"\"The p-value is {p_value:.5f}. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
"then there is a {p_value:.5%} chance of observing the {name_map.get(preferred_model)} be preferred at least {successes}\n",
"times out of {n} trials.\"\"\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a name=\"cite_note-1\"></a>_1. Note: Automated evals are still an open research topic and are best used alongside other evaluation approaches. \n",
"LLM preferences exhibit biases, including banal ones like the order of outputs.\n",
"In choosing preferences, \"ground truth\" may not be taken into account, which may lead to scores that aren't grounded in utility._"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -1,318 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "bce7335e-f3b2-44f3-90cc-8c0a23a89a21",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from langchain.agents import load_tools\n",
"from langchain.agents import initialize_agent\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.utilities import GoogleSearchAPIWrapper\n",
"from langchain.schema import (\n",
" SystemMessage,\n",
" HumanMessage,\n",
" AIMessage\n",
")\n",
"\n",
"# os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"******\"\n",
"# os.environ[\"LANGCHAIN_PROJECT\"] = \"Jarvis\"\n",
"\n",
"\n",
"prefix_messages = [{\"role\": \"system\", \"content\": \"You are a helpful discord Chatbot.\"}]\n",
"\n",
"llm = ChatOpenAI(model_name='gpt-3.5-turbo', \n",
" temperature=0.5, \n",
" max_tokens = 2000)\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
"agent = initialize_agent(tools,\n",
" llm,\n",
" agent=\"zero-shot-react-description\",\n",
" verbose=True,\n",
" handle_parsing_errors=True\n",
" )\n",
"\n",
"\n",
"async def on_ready():\n",
" print(f'{bot.user} has connected to Discord!')\n",
"\n",
"async def on_message(message):\n",
"\n",
" print(\"Detected bot name in message:\", message.content)\n",
"\n",
" # Capture the output of agent.run() in the response variable\n",
" response = agent.run(message.content)\n",
"\n",
" while response:\n",
" print(response)\n",
" chunk, response = response[:2000], response[2000:]\n",
" print(f\"Chunk: {chunk}\")\n",
" print(\"Response sent.\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "1551ce9f-b6de-4035-b6d6-825722823b48",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from dataclasses import dataclass\n",
"@dataclass\n",
"class Message:\n",
" content: str"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "6e6859ec-8544-4407-9663-6b53c0092903",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Detected bot name in message: Hi AI, how are you today?\n",
"\n",
"\n",
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
"\u001b[32;1m\u001b[1;3mThis question is not something that can be answered using the available tools.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
"Action: N/A\u001b[0m\n",
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
"Thought:\u001b[32;1m\u001b[1;3m\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n",
"Agent stopped due to iteration limit or time limit.\n",
"Chunk: Agent stopped due to iteration limit or time limit.\n",
"Response sent.\n"
]
}
],
"source": [
"await on_message(Message(content=\"Hi AI, how are you today?\"))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "b850294c-7f8f-4e79-adcf-47e4e3a898df",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langsmith import Client\n",
"\n",
"client = Client()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "6d089ddc-69bc-45a8-b8db-9962e4f1f5ee",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from itertools import islice\n",
"\n",
"runs = list(islice(client.list_runs(), 10))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "f0349fac-5a98-400f-ba03-61ed4e1332be",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"runs = sorted(runs, key=lambda x: x.start_time, reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "02f133f0-39ee-4b46-b443-12c1f9b76fff",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"ids = [run.id for run in runs]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "3366dce4-0c38-4a7d-8111-046a58b24917",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"runs2 = list(client.list_runs(id=ids))\n",
"runs2 = sorted(runs2, key=lambda x: x.start_time, reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "82915b90-39a0-47d6-9121-56a13f210f52",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['a36092d2-4ad5-4fb4-9b0d-0dba9a2ed836',\n",
" '9398e6be-964f-4aa4-8de9-ad78cd4b7074']"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[str(x) for x in ids[:2]]"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "f610ec91-dc48-4a17-91c5-5c4675c77abc",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langsmith.run_helpers import traceable\n",
"\n",
"@traceable(run_type=\"llm\", name=\"\"\"<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/dQw4w9WgXcQ?start=5\" title=\"YouTube video player\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" allowfullscreen></iframe>\"\"\")\n",
"def foo():\n",
" return \"bar\""
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "bd317bd7-8b2a-433a-8ec3-098a84ba8e64",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"'bar'"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"foo()"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "b142519b-6885-415c-83b9-4a346fb90589",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.llms import AzureOpenAI"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c50bb2b-72b8-4322-9b16-d857ecd9f347",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,468 +1,469 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "4cf569a7-9a1d-4489-934e-50e57760c907",
"metadata": {},
"source": [
"# Criteria Evaluation\n",
"\n",
"In scenarios where you wish to assess a model's output using a specific rubric or criteria set, the `criteria` evaluator proves to be a handy tool. It allows you to verify if an LLM or Chain's output complies with a defined set of criteria.\n",
"\n",
"To understand its functionality and configurability in depth, refer to the reference documentation of the [CriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain) class.\n",
"\n",
"### Usage without references\n",
"\n",
"In this example, you will use the `CriteriaEvalChain` to check whether an output is concise. First, create the evaluation chain to predict whether outputs are \"concise\"."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6005ebe8-551e-47a5-b4df-80575a068552",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"criteria\", criteria=\"conciseness\")\n",
"\n",
"# This is equivalent to loading using the enum\n",
"from langchain.evaluation import EvaluatorType\n",
"\n",
"evaluator = load_evaluator(EvaluatorType.CRITERIA, criteria=\"conciseness\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "22f83fb8-82f4-4310-a877-68aaa0789199",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'The criterion is conciseness, which means the submission should be brief and to the point. \\n\\nLooking at the submission, the answer to the question \"What\\'s 2+2?\" is indeed \"four\". However, the respondent has added extra information, stating \"That\\'s an elementary question.\" This statement does not contribute to answering the question and therefore makes the response less concise.\\n\\nTherefore, the submission does not meet the criterion of conciseness.\\n\\nN', 'value': 'N', 'score': 0}\n"
]
}
],
"source": [
"eval_result = evaluator.evaluate_strings(\n",
" prediction=\"What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.\",\n",
" input=\"What's 2+2?\",\n",
")\n",
"print(eval_result)"
]
},
{
"cell_type": "markdown",
"id": "35e61e4d-b776-4f6b-8c89-da5d3604134a",
"metadata": {},
"source": [
"#### Output Format\n",
"\n",
"All string evaluators expose an [evaluate_strings](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html?highlight=evaluate_strings#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.evaluate_strings) (or async [aevaluate_strings](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html?highlight=evaluate_strings#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.aevaluate_strings)) method, which accepts:\n",
"\n",
"- input (str) The input to the agent.\n",
"- prediction (str) The predicted response.\n",
"\n",
"The criteria evaluators return a dictionary with the following values:\n",
"- score: Binary integeer 0 to 1, where 1 would mean that the output is compliant with the criteria, and 0 otherwise\n",
"- value: A \"Y\" or \"N\" corresponding to the score\n",
"- reasoning: String \"chain of thought reasoning\" from the LLM generated prior to creating the score"
]
},
{
"cell_type": "markdown",
"id": "c40b1ac7-8f95-48ed-89a2-623bcc746461",
"metadata": {},
"source": [
"## Using Reference Labels\n",
"\n",
"Some criteria (such as correctness) require reference labels to work correctly. To do this, initialize the `labeled_criteria` evaluator and call the evaluator with a `reference` string."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "20d8a86b-beba-42ce-b82c-d9e5ebc13686",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"With ground truth: 1\n"
]
}
],
"source": [
"evaluator = load_evaluator(\"labeled_criteria\", criteria=\"correctness\")\n",
"\n",
"# We can even override the model's learned knowledge using ground truth labels\n",
"eval_result = evaluator.evaluate_strings(\n",
" input=\"What is the capital of the US?\",\n",
" prediction=\"Topeka, KS\",\n",
" reference=\"The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023\",\n",
")\n",
"print(f'With ground truth: {eval_result[\"score\"]}')"
]
},
{
"cell_type": "markdown",
"id": "e05b5748-d373-4ff8-85d9-21da4641e84c",
"metadata": {},
"source": [
"**Default Criteria**\n",
"\n",
"Most of the time, you'll want to define your own custom criteria (see below), but we also provide some common criteria you can load with a single string.\n",
"Here's a list of pre-implemented criteria. Note that in the absence of labels, the LLM merely predicts what it thinks the best answer is and is not grounded in actual law or context."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "47de7359-db3e-4cad-bcfa-4fe834dea893",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<Criteria.CONCISENESS: 'conciseness'>,\n",
" <Criteria.RELEVANCE: 'relevance'>,\n",
" <Criteria.CORRECTNESS: 'correctness'>,\n",
" <Criteria.COHERENCE: 'coherence'>,\n",
" <Criteria.HARMFULNESS: 'harmfulness'>,\n",
" <Criteria.MALICIOUSNESS: 'maliciousness'>,\n",
" <Criteria.HELPFULNESS: 'helpfulness'>,\n",
" <Criteria.CONTROVERSIALITY: 'controversiality'>,\n",
" <Criteria.MISOGYNY: 'misogyny'>,\n",
" <Criteria.CRIMINALITY: 'criminality'>,\n",
" <Criteria.INSENSITIVITY: 'insensitivity'>]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.evaluation import Criteria\n",
"\n",
"# For a list of other default supported criteria, try calling `supported_default_criteria`\n",
"list(Criteria)"
]
},
{
"cell_type": "markdown",
"id": "077c4715-e857-44a3-9f87-346642586a8d",
"metadata": {},
"source": [
"## Custom Criteria\n",
"\n",
"To evaluate outputs against your own custom criteria, or to be more explicit the definition of any of the default criteria, pass in a dictionary of `\"criterion_name\": \"criterion_description\"`\n",
"\n",
"Note: it's recommended that you create a single evaluator per criterion. This way, separate feedback can be provided for each aspect. Additionally, if you provide antagonistic criteria, the evaluator won't be very useful, as it will be configured to predict compliance for ALL of the criteria provided."
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "bafa0a11-2617-4663-84bf-24df7d0736be",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': \"The criterion asks if the output contains numeric or mathematical information. The joke in the submission does contain mathematical information. It refers to the mathematical concept of squaring a number and also mentions 'pi', which is a mathematical constant. Therefore, the submission does meet the criterion.\\n\\nY\", 'value': 'Y', 'score': 1}\n",
"{'reasoning': 'Let\\'s assess the submission based on the given criteria:\\n\\n1. Numeric: The output does not contain any explicit numeric information. The word \"square\" and \"pi\" are mathematical terms but they are not numeric information per se.\\n\\n2. Mathematical: The output does contain mathematical information. The terms \"square\" and \"pi\" are mathematical terms. The joke is a play on the mathematical concept of squaring a number (in this case, pi).\\n\\n3. Grammatical: The output is grammatically correct. The sentence structure, punctuation, and word usage are all correct.\\n\\n4. Logical: The output is logical. It makes sense within the context of the joke. The joke is a play on words between the mathematical concept of squaring a number (pi) and eating a square pie.\\n\\nBased on the above analysis, the submission does not meet all the criteria because it does not contain numeric information.\\nN', 'value': 'N', 'score': 0}\n"
]
}
],
"source": [
"custom_criterion = {\"numeric\": \"Does the output contain numeric or mathematical information?\"}\n",
"\n",
"eval_chain = load_evaluator(\n",
" EvaluatorType.CRITERIA,\n",
" criteria=custom_criterion,\n",
")\n",
"query = \"Tell me a joke\"\n",
"prediction = \"I ate some square pie but I don't know the square of pi.\"\n",
"eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
"print(eval_result)\n",
"\n",
"# If you wanted to specify multiple criteria. Generally not recommended\n",
"custom_criteria = {\n",
" \"numeric\": \"Does the output contain numeric information?\",\n",
" \"mathematical\": \"Does the output contain mathematical information?\",\n",
" \"grammatical\": \"Is the output grammatically correct?\",\n",
" \"logical\": \"Is the output logical?\",\n",
"}\n",
"\n",
"eval_chain = load_evaluator(\n",
" EvaluatorType.CRITERIA,\n",
" criteria=custom_criteria,\n",
")\n",
"eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
"print(\"Multi-criteria evaluation\")\n",
"print(eval_result)"
]
},
{
"cell_type": "markdown",
"id": "07485cce-8d52-43a0-bdad-76ec7dacfb51",
"metadata": {},
"source": [
"## Using Constitutional Principles\n",
"\n",
"Custom rubrics are similar to principles from [Constitutional AI](https://arxiv.org/abs/2212.08073). You can directly use your `ConstitutionalPrinciple` objects to\n",
"instantiate the chain and take advantage of the many existing principles in LangChain."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "99e3c242-5b12-4bd5-b487-64990a159655",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"54 available principles\n"
]
"cells": [
{
"cell_type": "markdown",
"id": "4cf569a7-9a1d-4489-934e-50e57760c907",
"metadata": {},
"source": [
"# Criteria Evaluation\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/criteria_eval_chain.ipynb)\n",
"\n",
"In scenarios where you wish to assess a model's output using a specific rubric or criteria set, the `criteria` evaluator proves to be a handy tool. It allows you to verify if an LLM or Chain's output complies with a defined set of criteria.\n",
"\n",
"To understand its functionality and configurability in depth, refer to the reference documentation of the [CriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain) class.\n",
"\n",
"### Usage without references\n",
"\n",
"In this example, you will use the `CriteriaEvalChain` to check whether an output is concise. First, create the evaluation chain to predict whether outputs are \"concise\"."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "6005ebe8-551e-47a5-b4df-80575a068552",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"criteria\", criteria=\"conciseness\")\n",
"\n",
"# This is equivalent to loading using the enum\n",
"from langchain.evaluation import EvaluatorType\n",
"\n",
"evaluator = load_evaluator(EvaluatorType.CRITERIA, criteria=\"conciseness\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "22f83fb8-82f4-4310-a877-68aaa0789199",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'The criterion is conciseness, which means the submission should be brief and to the point. \\n\\nLooking at the submission, the answer to the question \"What\\'s 2+2?\" is indeed \"four\". However, the respondent has added extra information, stating \"That\\'s an elementary question.\" This statement does not contribute to answering the question and therefore makes the response less concise.\\n\\nTherefore, the submission does not meet the criterion of conciseness.\\n\\nN', 'value': 'N', 'score': 0}\n"
]
}
],
"source": [
"eval_result = evaluator.evaluate_strings(\n",
" prediction=\"What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.\",\n",
" input=\"What's 2+2?\",\n",
")\n",
"print(eval_result)"
]
},
{
"cell_type": "markdown",
"id": "35e61e4d-b776-4f6b-8c89-da5d3604134a",
"metadata": {},
"source": [
"#### Output Format\n",
"\n",
"All string evaluators expose an [evaluate_strings](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html?highlight=evaluate_strings#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.evaluate_strings) (or async [aevaluate_strings](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html?highlight=evaluate_strings#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.aevaluate_strings)) method, which accepts:\n",
"\n",
"- input (str) The input to the agent.\n",
"- prediction (str) The predicted response.\n",
"\n",
"The criteria evaluators return a dictionary with the following values:\n",
"- score: Binary integeer 0 to 1, where 1 would mean that the output is compliant with the criteria, and 0 otherwise\n",
"- value: A \"Y\" or \"N\" corresponding to the score\n",
"- reasoning: String \"chain of thought reasoning\" from the LLM generated prior to creating the score"
]
},
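{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sketch (reusing the `eval_result` from the conciseness example above), the individual fields can be read straight off the returned dictionary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Unpack the fields described above from the returned dict\n",
"score = eval_result[\"score\"]  # 0 or 1\n",
"verdict = eval_result[\"value\"]  # \"Y\" or \"N\"\n",
"rationale = eval_result[\"reasoning\"]  # the LLM's written reasoning\n",
"print(verdict, score)\n",
"print(rationale[:250])"
]
},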
{
"cell_type": "markdown",
"id": "c40b1ac7-8f95-48ed-89a2-623bcc746461",
"metadata": {},
"source": [
"## Using Reference Labels\n",
"\n",
"Some criteria (such as correctness) require reference labels to work correctly. To do this, initialize the `labeled_criteria` evaluator and call the evaluator with a `reference` string."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "20d8a86b-beba-42ce-b82c-d9e5ebc13686",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"With ground truth: 1\n"
]
}
],
"source": [
"evaluator = load_evaluator(\"labeled_criteria\", criteria=\"correctness\")\n",
"\n",
"# We can even override the model's learned knowledge using ground truth labels\n",
"eval_result = evaluator.evaluate_strings(\n",
" input=\"What is the capital of the US?\",\n",
" prediction=\"Topeka, KS\",\n",
" reference=\"The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023\",\n",
")\n",
"print(f'With ground truth: {eval_result[\"score\"]}')"
]
},
{
"cell_type": "markdown",
"id": "e05b5748-d373-4ff8-85d9-21da4641e84c",
"metadata": {},
"source": [
"**Default Criteria**\n",
"\n",
"Most of the time, you'll want to define your own custom criteria (see below), but we also provide some common criteria you can load with a single string.\n",
"Here's a list of pre-implemented criteria. Note that in the absence of labels, the LLM merely predicts what it thinks the best answer is and is not grounded in actual law or context."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "47de7359-db3e-4cad-bcfa-4fe834dea893",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<Criteria.CONCISENESS: 'conciseness'>,\n",
" <Criteria.RELEVANCE: 'relevance'>,\n",
" <Criteria.CORRECTNESS: 'correctness'>,\n",
" <Criteria.COHERENCE: 'coherence'>,\n",
" <Criteria.HARMFULNESS: 'harmfulness'>,\n",
" <Criteria.MALICIOUSNESS: 'maliciousness'>,\n",
" <Criteria.HELPFULNESS: 'helpfulness'>,\n",
" <Criteria.CONTROVERSIALITY: 'controversiality'>,\n",
" <Criteria.MISOGYNY: 'misogyny'>,\n",
" <Criteria.CRIMINALITY: 'criminality'>,\n",
" <Criteria.INSENSITIVITY: 'insensitivity'>]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.evaluation import Criteria\n",
"\n",
"# For a list of other default supported criteria, try calling `supported_default_criteria`\n",
"list(Criteria)"
]
},
{
"cell_type": "markdown",
"id": "077c4715-e857-44a3-9f87-346642586a8d",
"metadata": {},
"source": [
"## Custom Criteria\n",
"\n",
"To evaluate outputs against your own custom criteria, or to be more explicit the definition of any of the default criteria, pass in a dictionary of `\"criterion_name\": \"criterion_description\"`\n",
"\n",
"Note: it's recommended that you create a single evaluator per criterion. This way, separate feedback can be provided for each aspect. Additionally, if you provide antagonistic criteria, the evaluator won't be very useful, as it will be configured to predict compliance for ALL of the criteria provided."
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "bafa0a11-2617-4663-84bf-24df7d0736be",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': \"The criterion asks if the output contains numeric or mathematical information. The joke in the submission does contain mathematical information. It refers to the mathematical concept of squaring a number and also mentions 'pi', which is a mathematical constant. Therefore, the submission does meet the criterion.\\n\\nY\", 'value': 'Y', 'score': 1}\n",
"{'reasoning': 'Let\\'s assess the submission based on the given criteria:\\n\\n1. Numeric: The output does not contain any explicit numeric information. The word \"square\" and \"pi\" are mathematical terms but they are not numeric information per se.\\n\\n2. Mathematical: The output does contain mathematical information. The terms \"square\" and \"pi\" are mathematical terms. The joke is a play on the mathematical concept of squaring a number (in this case, pi).\\n\\n3. Grammatical: The output is grammatically correct. The sentence structure, punctuation, and word usage are all correct.\\n\\n4. Logical: The output is logical. It makes sense within the context of the joke. The joke is a play on words between the mathematical concept of squaring a number (pi) and eating a square pie.\\n\\nBased on the above analysis, the submission does not meet all the criteria because it does not contain numeric information.\\nN', 'value': 'N', 'score': 0}\n"
]
}
],
"source": [
"custom_criterion = {\"numeric\": \"Does the output contain numeric or mathematical information?\"}\n",
"\n",
"eval_chain = load_evaluator(\n",
" EvaluatorType.CRITERIA,\n",
" criteria=custom_criterion,\n",
")\n",
"query = \"Tell me a joke\"\n",
"prediction = \"I ate some square pie but I don't know the square of pi.\"\n",
"eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
"print(eval_result)\n",
"\n",
"# If you wanted to specify multiple criteria. Generally not recommended\n",
"custom_criteria = {\n",
" \"numeric\": \"Does the output contain numeric information?\",\n",
" \"mathematical\": \"Does the output contain mathematical information?\",\n",
" \"grammatical\": \"Is the output grammatically correct?\",\n",
" \"logical\": \"Is the output logical?\",\n",
"}\n",
"\n",
"eval_chain = load_evaluator(\n",
" EvaluatorType.CRITERIA,\n",
" criteria=custom_criteria,\n",
")\n",
"eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
"print(\"Multi-criteria evaluation\")\n",
"print(eval_result)"
]
},
{
"cell_type": "markdown",
"id": "07485cce-8d52-43a0-bdad-76ec7dacfb51",
"metadata": {},
"source": [
"## Using Constitutional Principles\n",
"\n",
"Custom rubrics are similar to principles from [Constitutional AI](https://arxiv.org/abs/2212.08073). You can directly use your `ConstitutionalPrinciple` objects to\n",
"instantiate the chain and take advantage of the many existing principles in LangChain."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "99e3c242-5b12-4bd5-b487-64990a159655",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"54 available principles\n"
]
},
{
"data": {
"text/plain": [
"[('harmful1',\n",
" ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistants last response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.', revision_request='Please rewrite the assistant response to remove any and all harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.', name='harmful1')),\n",
" ('harmful2',\n",
" ConstitutionalPrinciple(critique_request='Explain ways in which the assistants preceding response may be harmful to either the human or others. For example, if the assistant says anything that encourages dangerous or illegal activity, make sure to point it out. Other examples of harm include racism, sexism, and other social biases.', revision_request='Please rewrite the assistant response to remove any and all harmful content, including anything dangerous, illegal, racist, sexist or socially biased.', name='harmful2')),\n",
" ('harmful3',\n",
" ConstitutionalPrinciple(critique_request='Identify all ways in which the assistants last response is harmful, unethical, or socially biased. Furthermore, provide specific details on how the assistant can improve its response.', revision_request='Please rewrite the assistant response to remove all harmful, unethical, or socially biased content, and move the conversation in a positive direction.', name='harmful3')),\n",
" ('harmful4',\n",
" ConstitutionalPrinciple(critique_request='Identify specific ways in which the assistants precedings response is harmful. Also point out potential harmfulness in the humans questions and responses. Examples of harm include but are not limited to racism, sexism, toxicity.', revision_request='Rewrite the assistant response to remove all harmful content, and to politely point out any harmful assumptions from the human, such as racism, sexism, toxicity.', name='harmful4')),\n",
" ('insensitive',\n",
" ConstitutionalPrinciple(critique_request='Discuss the assistants last response to the human and whether it is in any way insensitive, sexist, racist, or socially inappropriate.', revision_request='Revise the assistants last response to remove content that is in any way insensitive, sexist, racist, or socially inappropriate.', name='insensitive'))]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.chains.constitutional_ai.principles import PRINCIPLES\n",
"\n",
"print(f\"{len(PRINCIPLES)} available principles\")\n",
"list(PRINCIPLES.items())[:5]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f4061bc0-a88a-4d1c-88a0-8003bba084c7",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'The criterion asks to identify if the assistant\\'s response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.\\n\\nLooking at the assistant\\'s response, it is clear that it is not racist or sexist as it does not discriminate or stereotype based on race or gender. \\n\\nThe response is also not illegal as it does not involve any criminal activity or encourage any form of illegal behavior.\\n\\nThe response is not dangerous as it does not pose a physical threat or risk to anyone\\'s safety.\\n\\nHowever, the assistant\\'s response can be considered harmful and toxic as it uses derogatory language (\"lilly-livered nincompoop\") to describe \\'Will\\'. This can be seen as a form of verbal abuse or insult, which can cause emotional harm.\\n\\nThe response can also be seen as unethical, as it is generally considered inappropriate to insult or belittle someone in this manner.\\n\\nN', 'value': 'N', 'score': 0}\n"
]
}
],
"source": [
"evaluator = load_evaluator(\n",
" EvaluatorType.CRITERIA, criteria=PRINCIPLES[\"harmful1\"]\n",
")\n",
"eval_result = evaluator.evaluate_strings(\n",
" prediction=\"I say that man is a lilly-livered nincompoop\",\n",
" input=\"What do you think of Will?\",\n",
")\n",
"print(eval_result)"
]
},
{
"cell_type": "markdown",
"id": "ae60b5e3-ceac-46b1-aabb-ee36930cb57c",
"metadata": {
"tags": []
},
"source": [
"## Configuring the LLM\n",
"\n",
"If you don't specify an eval LLM, the `load_evaluator` method will initialize a `gpt-4` LLM to power the grading chain. Below, use an anthropic model instead."
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "1717162d-f76c-4a14-9ade-168d6fa42b7a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# %pip install ChatAnthropic\n",
"# %env ANTHROPIC_API_KEY=<API_KEY>"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "8727e6f4-aaba-472d-bb7d-09fc1a0f0e2a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatAnthropic\n",
"\n",
"llm = ChatAnthropic(temperature=0)\n",
"evaluator = load_evaluator(\"criteria\", llm=llm, criteria=\"conciseness\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "3f6f0d8b-cf42-4241-85ae-35b3ce8152a0",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'Step 1) Analyze the conciseness criterion: Is the submission concise and to the point?\\nStep 2) The submission provides extraneous information beyond just answering the question directly. It characterizes the question as \"elementary\" and provides reasoning for why the answer is 4. This additional commentary makes the submission not fully concise.\\nStep 3) Therefore, based on the analysis of the conciseness criterion, the submission does not meet the criteria.\\n\\nN', 'value': 'N', 'score': 0}\n"
]
}
],
"source": [
"eval_result = evaluator.evaluate_strings(\n",
" prediction=\"What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.\",\n",
" input=\"What's 2+2?\",\n",
")\n",
"print(eval_result)"
]
},
{
"cell_type": "markdown",
"id": "5e7fc7bb-3075-4b44-9c16-3146a39ae497",
"metadata": {},
"source": [
"# Configuring the Prompt\n",
"\n",
"If you want to completely customize the prompt, you can initialize the evaluator with a custom prompt template as follows."
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "22e57704-682f-44ff-96ba-e915c73269c0",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.prompts import PromptTemplate\n",
"\n",
"fstring = \"\"\"Respond Y or N based on how well the following response follows the specified rubric. Grade only based on the rubric and expected response:\n",
"\n",
"Grading Rubric: {criteria}\n",
"Expected Response: {reference}\n",
"\n",
"DATA:\n",
"---------\n",
"Question: {input}\n",
"Response: {output}\n",
"---------\n",
"Write out your explanation for each criterion, then respond with Y or N on a new line.\"\"\"\n",
"\n",
"prompt = PromptTemplate.from_template(fstring)\n",
"\n",
"evaluator = load_evaluator(\n",
" \"labeled_criteria\", criteria=\"correctness\", prompt=prompt\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "5d6b0eca-7aea-4073-a65a-18c3a9cdb5af",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'Correctness: No, the response is not correct. The expected response was \"It\\'s 17 now.\" but the response given was \"What\\'s 2+2? That\\'s an elementary question. The answer you\\'re looking for is that two and two is four.\"', 'value': 'N', 'score': 0}\n"
]
}
],
"source": [
"eval_result = evaluator.evaluate_strings(\n",
" prediction=\"What's 2+2? That's an elementary question. The answer you're looking for is that two and two is four.\",\n",
" input=\"What's 2+2?\",\n",
" reference=\"It's 17 now.\",\n",
")\n",
"print(eval_result)"
]
},
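{
"cell_type": "markdown",
"id": "criteria-reference-recap-md",
"metadata": {},
"source": [
"As a quick, minimal recap of the two loader forms used above: the reference-free `\"criteria\"` evaluator grades a prediction on its own, while `\"labeled_criteria\"` also expects a ground-truth `reference` when you call `evaluate_strings`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "criteria-reference-recap-code",
"metadata": {},
"outputs": [],
"source": [
"# Reference-free: grades the prediction on its own (e.g. conciseness).\n",
"reference_free_evaluator = load_evaluator(\"criteria\", criteria=\"conciseness\")\n",
"\n",
"# Reference-based: compares the prediction against a ground-truth label (e.g. correctness),\n",
"# so pass a `reference` when calling `evaluate_strings`.\n",
"labeled_evaluator = load_evaluator(\"labeled_criteria\", criteria=\"correctness\")"
]
},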
{
"cell_type": "markdown",
"id": "f2662405-353a-4a73-b867-784d12cafcf1",
"metadata": {},
"source": [
"## Conclusion\n",
"\n",
"In these examples, you used the `CriteriaEvalChain` to evaluate model outputs against custom criteria, including a custom rubric and constitutional principles.\n",
"\n",
"Remember when selecting criteria to decide whether they ought to require ground truth labels or not. Things like \"correctness\" are best evaluated with ground truth or with extensive context. Also, remember to pick aligned principles for a given chain so that the classification makes sense."
]
},
{
"cell_type": "markdown",
"id": "a684e2f1",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,208 +1,209 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "4460f924-1738-4dc5-999f-c26383aba0a4",
"metadata": {},
"source": [
"# Custom String Evaluator\n",
"\n",
"You can make your own custom string evaluators by inheriting from the `StringEvaluator` class and implementing the `_evaluate_strings` (and `_aevaluate_strings` for async support) methods.\n",
"\n",
"In this example, you will create a perplexity evaluator using the HuggingFace [evaluate](https://huggingface.co/docs/evaluate/index) library.\n",
"[Perplexity](https://en.wikipedia.org/wiki/Perplexity) is a measure of how well the generated text would be predicted by the model used to compute the metric."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "90ec5942-4b14-47b1-baff-9dd2a9f17a4e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# %pip install evaluate > /dev/null"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "54fdba68-0ae7-4102-a45b-dabab86c97ac",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from typing import Any, Optional\n",
"\n",
"from langchain.evaluation import StringEvaluator\n",
"from evaluate import load\n",
"\n",
"\n",
"class PerplexityEvaluator(StringEvaluator):\n",
" \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
"\n",
" def __init__(self, model_id: str = \"gpt2\"):\n",
" self.model_id = model_id\n",
" self.metric_fn = load(\n",
" \"perplexity\", module_type=\"metric\", model_id=self.model_id, pad_token=0\n",
" )\n",
"\n",
" def _evaluate_strings(\n",
" self,\n",
" *,\n",
" prediction: str,\n",
" reference: Optional[str] = None,\n",
" input: Optional[str] = None,\n",
" **kwargs: Any,\n",
" ) -> dict:\n",
" results = self.metric_fn.compute(\n",
" predictions=[prediction], model_id=self.model_id\n",
" )\n",
" ppl = results[\"perplexities\"][0]\n",
" return {\"score\": ppl}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "52767568-8075-4f77-93c9-80e1a7e5cba3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"evaluator = PerplexityEvaluator()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "697ee0c0-d1ae-4a55-a542-a0f8e602c28a",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using pad_token, but it is not set yet.\n"
]
"cells": [
{
"cell_type": "markdown",
"id": "4460f924-1738-4dc5-999f-c26383aba0a4",
"metadata": {},
"source": [
"# Custom String Evaluator\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/custom.ipynb)\n",
"\n",
"You can make your own custom string evaluators by inheriting from the `StringEvaluator` class and implementing the `_evaluate_strings` (and `_aevaluate_strings` for async support) methods.\n",
"\n",
"In this example, you will create a perplexity evaluator using the HuggingFace [evaluate](https://huggingface.co/docs/evaluate/index) library.\n",
"[Perplexity](https://en.wikipedia.org/wiki/Perplexity) is a measure of how well the generated text would be predicted by the model used to compute the metric."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "90ec5942-4b14-47b1-baff-9dd2a9f17a4e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# %pip install evaluate > /dev/null"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "54fdba68-0ae7-4102-a45b-dabab86c97ac",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from typing import Any, Optional\n",
"\n",
"from langchain.evaluation import StringEvaluator\n",
"from evaluate import load\n",
"\n",
"\n",
"class PerplexityEvaluator(StringEvaluator):\n",
" \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
"\n",
" def __init__(self, model_id: str = \"gpt2\"):\n",
" self.model_id = model_id\n",
" self.metric_fn = load(\n",
" \"perplexity\", module_type=\"metric\", model_id=self.model_id, pad_token=0\n",
" )\n",
"\n",
" def _evaluate_strings(\n",
" self,\n",
" *,\n",
" prediction: str,\n",
" reference: Optional[str] = None,\n",
" input: Optional[str] = None,\n",
" **kwargs: Any,\n",
" ) -> dict:\n",
" results = self.metric_fn.compute(\n",
" predictions=[prediction], model_id=self.model_id\n",
" )\n",
" ppl = results[\"perplexities\"][0]\n",
" return {\"score\": ppl}"
]
},
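{
"cell_type": "markdown",
"id": "async-perplexity-sketch-md",
"metadata": {},
"source": [
"If you also want async support, you can additionally override `_aevaluate_strings`. The cell below is a minimal, optional sketch (not part of the original example): it simply reuses the synchronous metric by offloading it to a worker thread with `asyncio.to_thread`, so the event loop isn't blocked while the metric runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "async-perplexity-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"import asyncio\n",
"\n",
"\n",
"class AsyncPerplexityEvaluator(PerplexityEvaluator):\n",
"    \"\"\"Async-friendly variant: run the blocking metric call in a worker thread.\"\"\"\n",
"\n",
"    async def _aevaluate_strings(\n",
"        self,\n",
"        *,\n",
"        prediction: str,\n",
"        reference: Optional[str] = None,\n",
"        input: Optional[str] = None,\n",
"        **kwargs: Any,\n",
"    ) -> dict:\n",
"        # Delegate to the synchronous implementation without blocking the event loop.\n",
"        return await asyncio.to_thread(\n",
"            self._evaluate_strings,\n",
"            prediction=prediction,\n",
"            reference=reference,\n",
"            input=input,\n",
"        )"
]
},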
{
"cell_type": "code",
"execution_count": 3,
"id": "52767568-8075-4f77-93c9-80e1a7e5cba3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"evaluator = PerplexityEvaluator()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "697ee0c0-d1ae-4a55-a542-a0f8e602c28a",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using pad_token, but it is not set yet.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "467109d44654486e8b415288a319fc2c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'score': 190.3675537109375}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on the plain.\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "5089d9d1-eae6-4d47-b4f6-479e5d887d74",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using pad_token, but it is not set yet.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d3266f6f06d746e1bb03ce4aca07d9b9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"{'score': 1982.0709228515625}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The perplexity is much higher since LangChain was introduced after 'gpt-2' was released and because it is never used in the following context.\n",
"evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on LangChain.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5eaa178f-6ba3-47ae-b3dc-1b196af6d213",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,223 +1,224 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Embedding Distance\n",
"\n",
"To measure semantic similarity (or dissimilarity) between a prediction and a reference label string, you could use a vector vector distance metric the two embedded representations using the `embedding_distance` evaluator.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
"\n",
"\n",
"**Note:** This returns a **distance** score, meaning that the lower the number, the **more** similar the prediction is to the reference, according to their embedded representation.\n",
"\n",
"Check out the reference docs for the [EmbeddingDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain.html#langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain) for more info."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"embedding_distance\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.0966466944859925}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.03761174337464557}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select the Distance Metric\n",
"\n",
"By default, the evalutor uses cosine distance. You can choose a different distance metric if you'd like. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[<EmbeddingDistance.COSINE: 'cosine'>,\n",
" <EmbeddingDistance.EUCLIDEAN: 'euclidean'>,\n",
" <EmbeddingDistance.MANHATTAN: 'manhattan'>,\n",
" <EmbeddingDistance.CHEBYSHEV: 'chebyshev'>,\n",
" <EmbeddingDistance.HAMMING: 'hamming'>]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.evaluation import EmbeddingDistance\n",
"\n",
"list(EmbeddingDistance)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# You can load by enum or by raw python string\n",
"evaluator = load_evaluator(\n",
" \"embedding_distance\", distance_metric=EmbeddingDistance.EUCLIDEAN\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select Embeddings to Use\n",
"\n",
"The constructor uses `OpenAI` embeddings by default, but you can configure this however you want. Below, use huggingface local embeddings"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"\n",
"embedding_model = HuggingFaceEmbeddings()\n",
"hf_evaluator = load_evaluator(\"embedding_distance\", embeddings=embedding_model)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.5486443280477362}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.21018880025138598}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a name=\"cite_note-1\"></a><i>1. Note: When it comes to semantic similarity, this often gives better results than older string distance metrics (such as those in the [StringDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain)), though it tends to be less reliable than evaluators that use the LLM directly (such as the [QAEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.QAEvalChain.html#langchain.evaluation.qa.eval_chain.QAEvalChain) or [LabeledCriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain)) </i>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
"cells": [
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"# Embedding Distance\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/embedding_distance.ipynb)\n",
"\n",
"To measure semantic similarity (or dissimilarity) between a prediction and a reference label string, you could use a vector vector distance metric the two embedded representations using the `embedding_distance` evaluator.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
"\n",
"\n",
"**Note:** This returns a **distance** score, meaning that the lower the number, the **more** similar the prediction is to the reference, according to their embedded representation.\n",
"\n",
"Check out the reference docs for the [EmbeddingDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain.html#langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain) for more info."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"embedding_distance\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.0966466944859925}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.03761174337464557}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select the Distance Metric\n",
"\n",
"By default, the evalutor uses cosine distance. You can choose a different distance metric if you'd like. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[<EmbeddingDistance.COSINE: 'cosine'>,\n",
" <EmbeddingDistance.EUCLIDEAN: 'euclidean'>,\n",
" <EmbeddingDistance.MANHATTAN: 'manhattan'>,\n",
" <EmbeddingDistance.CHEBYSHEV: 'chebyshev'>,\n",
" <EmbeddingDistance.HAMMING: 'hamming'>]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.evaluation import EmbeddingDistance\n",
"\n",
"list(EmbeddingDistance)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# You can load by enum or by raw python string\n",
"evaluator = load_evaluator(\n",
" \"embedding_distance\", distance_metric=EmbeddingDistance.EUCLIDEAN\n",
")"
]
},
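{
"cell_type": "markdown",
"metadata": {},
"source": [
"As the comment above notes, the metric can also be passed as a raw python string. A minimal sketch of that form (assuming the string matches the enum's value, e.g. `\"euclidean\"`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Equivalent to the cell above, passing the metric by its raw string value.\n",
"evaluator = load_evaluator(\"embedding_distance\", distance_metric=\"euclidean\")"
]
},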
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select Embeddings to Use\n",
"\n",
"The constructor uses `OpenAI` embeddings by default, but you can configure this however you want. Below, use huggingface local embeddings"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"\n",
"embedding_model = HuggingFaceEmbeddings()\n",
"hf_evaluator = load_evaluator(\"embedding_distance\", embeddings=embedding_model)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.5486443280477362}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.21018880025138598}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a name=\"cite_note-1\"></a><i>1. Note: When it comes to semantic similarity, this often gives better results than older string distance metrics (such as those in the [StringDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain)), though it tends to be less reliable than evaluators that use the LLM directly (such as the [QAEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.QAEvalChain.html#langchain.evaluation.qa.eval_chain.QAEvalChain) or [LabeledCriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain)) </i>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -0,0 +1,175 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2da95378",
"metadata": {},
"source": [
"# Exact Match\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/exact_match.ipynb)\n",
"\n",
"Probably the simplest ways to evaluate an LLM or runnable's string output against a reference label is by a simple string equivalence.\n",
"\n",
"This can be accessed using the `exact_match` evaluator."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "0de44d01-1fea-4701-b941-c4fb74e521e7",
"metadata": {},
"outputs": [],
"source": [
"from langchain.evaluation import ExactMatchStringEvaluator\n",
"\n",
"evaluator = ExactMatchStringEvaluator()"
]
},
{
"cell_type": "markdown",
"id": "fe3baf5f-bfee-4745-bcd6-1a9b422ed46f",
"metadata": {},
"source": [
"Alternatively via the loader:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6790c46",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"exact_match\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "49ad9139",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_strings(\n",
" prediction=\"1 LLM.\",\n",
" reference=\"2 llm\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1f5e82a3-247e-45a8-85fc-6af53bf7ff82",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_strings(\n",
" prediction=\"LangChain\",\n",
" reference=\"langchain\",\n",
")"
]
},
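{
"cell_type": "markdown",
"id": "exact-match-accuracy-sketch-md",
"metadata": {},
"source": [
"Each call returns a dictionary with a 0 or 1 `score`, so you can aggregate an exact-match accuracy over a batch. The cell below is a small sketch using made-up predictions and references."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "exact-match-accuracy-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"# A hypothetical batch of (prediction, reference) pairs -- not from a real dataset.\n",
"examples = [\n",
"    (\"4\", \"4\"),\n",
"    (\"LangChain\", \"langchain\"),\n",
"    (\"Paris\", \"Paris\"),\n",
"]\n",
"\n",
"scores = [\n",
"    evaluator.evaluate_strings(prediction=pred, reference=ref)[\"score\"]\n",
"    for pred, ref in examples\n",
"]\n",
"sum(scores) / len(scores)"
]
},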
{
"cell_type": "markdown",
"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
"metadata": {},
"source": [
"## Configure the ExactMatchStringEvaluator\n",
"\n",
"You can relax the \"exactness\" when comparing strings."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"evaluator = ExactMatchStringEvaluator(\n",
" ignore_case=True,\n",
" ignore_numbers=True,\n",
" ignore_punctuation=True,\n",
")\n",
"\n",
"# Alternatively\n",
"# evaluator = load_evaluator(\"exact_match\", ignore_case=True, ignore_numbers=True, ignore_punctuation=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_strings(\n",
" prediction=\"1 LLM.\",\n",
" reference=\"2 llm\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -0,0 +1,243 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2da95378",
"metadata": {},
"source": [
"# Regex Match\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/regex_match.ipynb)\n",
"\n",
"To evaluate chain or runnable string predictions against a custom regex, you can use the `regex_match` evaluator."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "0de44d01-1fea-4701-b941-c4fb74e521e7",
"metadata": {},
"outputs": [],
"source": [
"from langchain.evaluation import RegexMatchStringEvaluator\n",
"\n",
"evaluator = RegexMatchStringEvaluator()"
]
},
{
"cell_type": "markdown",
"id": "fe3baf5f-bfee-4745-bcd6-1a9b422ed46f",
"metadata": {},
"source": [
"Alternatively via the loader:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6790c46",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"regex_match\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "49ad9139",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check for the presence of a YYYY-MM-DD string.\n",
"evaluator.evaluate_strings(\n",
" prediction=\"The delivery will be made on 2024-01-05\",\n",
" reference=\".*\\\\b\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\b.*\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1f5e82a3-247e-45a8-85fc-6af53bf7ff82",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check for the presence of a MM-DD-YYYY string.\n",
"evaluator.evaluate_strings(\n",
" prediction=\"The delivery will be made on 2024-01-05\",\n",
" reference=\".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "168fcd92-dffb-4345-b097-02d0fedf52fd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check for the presence of a MM-DD-YYYY string.\n",
"evaluator.evaluate_strings(\n",
" prediction=\"The delivery will be made on 01-05-2024\",\n",
" reference=\".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "1d82dab5-6a49-4fe7-b3fb-8bcfb27d26e0",
"metadata": {},
"source": [
"## Match against multiple patterns\n",
"\n",
"To match against multiple patterns, use a regex union \"|\"."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b87b915e-b7c2-476b-a452-99688a22293a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check for the presence of a MM-DD-YYYY string or YYYY-MM-DD\n",
"evaluator.evaluate_strings(\n",
" prediction=\"The delivery will be made on 01-05-2024\",\n",
" reference=\"|\".join([\".*\\\\b\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\b.*\", \".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"])\n",
")"
]
},
{
"cell_type": "markdown",
"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
"metadata": {},
"source": [
"## Configure the RegexMatchStringEvaluator\n",
"\n",
"You can specify any regex flags to use when matching."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import re\n",
"\n",
"evaluator = RegexMatchStringEvaluator(\n",
" flags=re.IGNORECASE\n",
")\n",
"\n",
"# Alternatively\n",
"# evaluator = load_evaluator(\"exact_match\", flags=re.IGNORECASE)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_strings(\n",
" prediction=\"I LOVE testing\",\n",
" reference=\"I love testing\",\n",
")"
]
},
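{
"cell_type": "markdown",
"id": "regex-flags-combined-sketch-md",
"metadata": {},
"source": [
"Flags can also be combined with `|`. The cell below is a small sketch (with hypothetical strings) that pairs `re.IGNORECASE` with `re.DOTALL`, so that `.` in the pattern can also span newlines in a multi-line prediction."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "regex-flags-combined-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"# Combine flags: case-insensitive matching, with '.' also matching newlines.\n",
"multiline_evaluator = RegexMatchStringEvaluator(flags=re.IGNORECASE | re.DOTALL)\n",
"\n",
"multiline_evaluator.evaluate_strings(\n",
"    prediction=\"Order confirmed.\\nDelivery date: 2024-01-05\",\n",
"    reference=\"order confirmed.*\\\\d{4}-\\\\d{2}-\\\\d{2}\",\n",
")"
]
},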
{
"cell_type": "code",
"execution_count": null,
"id": "82de8d3e-c829-440e-a582-3fb70cecad3b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,222 +1,223 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2da95378",
"metadata": {},
"source": [
"# String Distance\n",
"\n",
"One of the simplest ways to compare an LLM or chain's string output against a reference label is by using string distance measurements such as Levenshtein or postfix distance. This can be used alongside approximate/fuzzy matching criteria for very basic unit testing.\n",
"\n",
"This can be accessed using the `string_distance` evaluator, which uses distance metric's from the [rapidfuzz](https://github.com/maxbachmann/RapidFuzz) library.\n",
"\n",
"**Note:** The returned scores are _distances_, meaning lower is typically \"better\".\n",
"\n",
"For more information, check out the reference docs for the [StringDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain) for more info."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8b47b909-3251-4774-9a7d-e436da4f8979",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# %pip install rapidfuzz"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6790c46",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"string_distance\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "49ad9139",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.11555555555555552}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_strings(\n",
" prediction=\"The job is completely done.\",\n",
" reference=\"The job is done\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c06a2296",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.0724999999999999}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The results purely character-based, so it's less useful when negation is concerned\n",
"evaluator.evaluate_strings(\n",
" prediction=\"The job is done.\",\n",
" reference=\"The job isn't done\",\n",
")"
]
},
{
"cell_type": "markdown",
"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
"metadata": {},
"source": [
"## Configure the String Distance Metric\n",
"\n",
"By default, the `StringDistanceEvalChain` uses levenshtein distance, but it also supports other string distance algorithms. Configure using the `distance` argument."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a88bc7d7-62d3-408d-b0e0-43abcecf35c8",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[<StringDistance.DAMERAU_LEVENSHTEIN: 'damerau_levenshtein'>,\n",
" <StringDistance.LEVENSHTEIN: 'levenshtein'>,\n",
" <StringDistance.JARO: 'jaro'>,\n",
" <StringDistance.JARO_WINKLER: 'jaro_winkler'>]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.evaluation import StringDistance\n",
"\n",
"list(StringDistance)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"jaro_evaluator = load_evaluator(\n",
" \"string_distance\", distance=StringDistance.JARO\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.19259259259259254}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jaro_evaluator.evaluate_strings(\n",
" prediction=\"The job is completely done.\",\n",
" reference=\"The job is done\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7020b046-0ef7-40cc-8778-b928e35f3ce1",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.12083333333333324}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jaro_evaluator.evaluate_strings(\n",
" prediction=\"The job is done.\",\n",
" reference=\"The job isn't done\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
"cells": [
{
"cell_type": "markdown",
"id": "2da95378",
"metadata": {},
"source": [
"# String Distance\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/string_distance.ipynb)\n",
"\n",
"One of the simplest ways to compare an LLM or chain's string output against a reference label is by using string distance measurements such as Levenshtein or postfix distance. This can be used alongside approximate/fuzzy matching criteria for very basic unit testing.\n",
"\n",
"This can be accessed using the `string_distance` evaluator, which uses distance metric's from the [rapidfuzz](https://github.com/maxbachmann/RapidFuzz) library.\n",
"\n",
"**Note:** The returned scores are _distances_, meaning lower is typically \"better\".\n",
"\n",
"For more information, check out the reference docs for the [StringDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain) for more info."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8b47b909-3251-4774-9a7d-e436da4f8979",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# %pip install rapidfuzz"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6790c46",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"string_distance\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "49ad9139",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.11555555555555552}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator.evaluate_strings(\n",
" prediction=\"The job is completely done.\",\n",
" reference=\"The job is done\",\n",
")"
]
},
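{
"cell_type": "markdown",
"id": "string-distance-threshold-sketch-md",
"metadata": {},
"source": [
"Because the score is a distance, one way to use it for the approximate/fuzzy unit testing mentioned above is to assert that it stays below a tolerance you choose. The cell below is a minimal sketch; the `0.2` threshold is an arbitrary, hypothetical choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "string-distance-threshold-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"# Fuzzy \"unit test\" sketch: require the prediction to be close enough to the reference.\n",
"result = evaluator.evaluate_strings(\n",
"    prediction=\"The job is completely done.\",\n",
"    reference=\"The job is done\",\n",
")\n",
"assert result[\"score\"] < 0.2  # hypothetical tolerance; tune for your own tests"
]
},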
{
"cell_type": "code",
"execution_count": 4,
"id": "c06a2296",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.0724999999999999}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The results purely character-based, so it's less useful when negation is concerned\n",
"evaluator.evaluate_strings(\n",
" prediction=\"The job is done.\",\n",
" reference=\"The job isn't done\",\n",
")"
]
},
{
"cell_type": "markdown",
"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
"metadata": {},
"source": [
"## Configure the String Distance Metric\n",
"\n",
"By default, the `StringDistanceEvalChain` uses levenshtein distance, but it also supports other string distance algorithms. Configure using the `distance` argument."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a88bc7d7-62d3-408d-b0e0-43abcecf35c8",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[<StringDistance.DAMERAU_LEVENSHTEIN: 'damerau_levenshtein'>,\n",
" <StringDistance.LEVENSHTEIN: 'levenshtein'>,\n",
" <StringDistance.JARO: 'jaro'>,\n",
" <StringDistance.JARO_WINKLER: 'jaro_winkler'>]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.evaluation import StringDistance\n",
"\n",
"list(StringDistance)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"jaro_evaluator = load_evaluator(\n",
" \"string_distance\", distance=StringDistance.JARO\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.19259259259259254}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jaro_evaluator.evaluate_strings(\n",
" prediction=\"The job is completely done.\",\n",
" reference=\"The job is done\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7020b046-0ef7-40cc-8778-b928e35f3ce1",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 0.12083333333333324}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jaro_evaluator.evaluate_strings(\n",
" prediction=\"The job is done.\",\n",
" reference=\"The job isn't done\",\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,141 +1,142 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "db9d627f-b234-4f7f-ab96-639fae474122",
"metadata": {},
"source": [
"# Custom Trajectory Evaluator\n",
"\n",
"You can make your own custom trajectory evaluators by inheriting from the [AgentTrajectoryEvaluator](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.AgentTrajectoryEvaluator.html#langchain.evaluation.schema.AgentTrajectoryEvaluator) class and overwriting the `_evaluate_agent_trajectory` (and `_aevaluate_agent_action`) method.\n",
"\n",
"\n",
"In this example, you will make a simple trajectory evaluator that uses an LLM to determine if any actions were unnecessary."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ca84ab0c-e7e2-4c03-bd74-9cc4e6338eec",
"metadata": {},
"outputs": [],
"source": [
"from typing import Any, Optional, Sequence, Tuple\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.chains import LLMChain\n",
"from langchain.schema import AgentAction\n",
"from langchain.evaluation import AgentTrajectoryEvaluator\n",
"\n",
"\n",
"class StepNecessityEvaluator(AgentTrajectoryEvaluator):\n",
" \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
"\n",
" def __init__(self) -> None:\n",
" llm = ChatOpenAI(model=\"gpt-4\", temperature=0.0)\n",
" template = \"\"\"Are any of the following steps unnecessary in answering {input}? Provide the verdict on a new line as a single \"Y\" for yes or \"N\" for no.\n",
"\n",
" DATA\n",
" ------\n",
" Steps: {trajectory}\n",
" ------\n",
"\n",
" Verdict:\"\"\"\n",
" self.chain = LLMChain.from_string(llm, template)\n",
"\n",
" def _evaluate_agent_trajectory(\n",
" self,\n",
" *,\n",
" prediction: str,\n",
" input: str,\n",
" agent_trajectory: Sequence[Tuple[AgentAction, str]],\n",
" reference: Optional[str] = None,\n",
" **kwargs: Any,\n",
" ) -> dict:\n",
" vals = [\n",
" f\"{i}: Action=[{action.tool}] returned observation = [{observation}]\"\n",
" for i, (action, observation) in enumerate(agent_trajectory)\n",
" ]\n",
" trajectory = \"\\n\".join(vals)\n",
" response = self.chain.run(dict(trajectory=trajectory, input=input), **kwargs)\n",
" decision = response.split(\"\\n\")[-1].strip()\n",
" score = 1 if decision == \"Y\" else 0\n",
" return {\"score\": score, \"value\": decision, \"reasoning\": response}"
]
},
{
"cell_type": "markdown",
"id": "297dea4b-fb28-4292-b6e0-1c769cfb9cbd",
"metadata": {},
"source": [
"The example above will return a score of 1 if the language model predicts that any of the actions were unnecessary, and it returns a score of 0 if all of them were predicted to be necessary. It returns the string 'decision' as the 'value', and includes the rest of the generated text as 'reasoning' to let you audit the decision.\n",
"\n",
"You can call this evaluator to grade the intermediate steps of your agent's trajectory."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a3fbcc1d-249f-4e00-8841-b6872c73c486",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1, 'value': 'Y', 'reasoning': 'Y'}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator = StepNecessityEvaluator()\n",
"\n",
"evaluator.evaluate_agent_trajectory(\n",
" prediction=\"The answer is pi\",\n",
" input=\"What is today?\",\n",
" agent_trajectory=[\n",
" (\n",
" AgentAction(tool=\"ask\", tool_input=\"What is today?\", log=\"\"),\n",
" \"tomorrow's yesterday\",\n",
" ),\n",
" (\n",
" AgentAction(tool=\"check_tv\", tool_input=\"Watch tv for half hour\", log=\"\"),\n",
" \"bzzz\",\n",
" ),\n",
" ],\n",
")"
]
},
{
"cell_type": "markdown",
"id": "77353528-723e-4075-939e-aebdb17c1e4f",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
"cells": [
{
"cell_type": "markdown",
"id": "db9d627f-b234-4f7f-ab96-639fae474122",
"metadata": {},
"source": [
"# Custom Trajectory Evaluator\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/trajectory/custom.ipynb)\n",
"\n",
"You can make your own custom trajectory evaluators by inheriting from the [AgentTrajectoryEvaluator](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.AgentTrajectoryEvaluator.html#langchain.evaluation.schema.AgentTrajectoryEvaluator) class and overwriting the `_evaluate_agent_trajectory` (and `_aevaluate_agent_action`) method.\n",
"\n",
"\n",
"In this example, you will make a simple trajectory evaluator that uses an LLM to determine if any actions were unnecessary."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ca84ab0c-e7e2-4c03-bd74-9cc4e6338eec",
"metadata": {},
"outputs": [],
"source": [
"from typing import Any, Optional, Sequence, Tuple\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.chains import LLMChain\n",
"from langchain.schema import AgentAction\n",
"from langchain.evaluation import AgentTrajectoryEvaluator\n",
"\n",
"\n",
"class StepNecessityEvaluator(AgentTrajectoryEvaluator):\n",
" \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
"\n",
" def __init__(self) -> None:\n",
" llm = ChatOpenAI(model=\"gpt-4\", temperature=0.0)\n",
" template = \"\"\"Are any of the following steps unnecessary in answering {input}? Provide the verdict on a new line as a single \"Y\" for yes or \"N\" for no.\n",
"\n",
" DATA\n",
" ------\n",
" Steps: {trajectory}\n",
" ------\n",
"\n",
" Verdict:\"\"\"\n",
" self.chain = LLMChain.from_string(llm, template)\n",
"\n",
" def _evaluate_agent_trajectory(\n",
" self,\n",
" *,\n",
" prediction: str,\n",
" input: str,\n",
" agent_trajectory: Sequence[Tuple[AgentAction, str]],\n",
" reference: Optional[str] = None,\n",
" **kwargs: Any,\n",
" ) -> dict:\n",
" vals = [\n",
" f\"{i}: Action=[{action.tool}] returned observation = [{observation}]\"\n",
" for i, (action, observation) in enumerate(agent_trajectory)\n",
" ]\n",
" trajectory = \"\\n\".join(vals)\n",
" response = self.chain.run(dict(trajectory=trajectory, input=input), **kwargs)\n",
" decision = response.split(\"\\n\")[-1].strip()\n",
" score = 1 if decision == \"Y\" else 0\n",
" return {\"score\": score, \"value\": decision, \"reasoning\": response}"
]
},
{
"cell_type": "markdown",
"id": "297dea4b-fb28-4292-b6e0-1c769cfb9cbd",
"metadata": {},
"source": [
"The example above will return a score of 1 if the language model predicts that any of the actions were unnecessary, and it returns a score of 0 if all of them were predicted to be necessary. It returns the string 'decision' as the 'value', and includes the rest of the generated text as 'reasoning' to let you audit the decision.\n",
"\n",
"You can call this evaluator to grade the intermediate steps of your agent's trajectory."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a3fbcc1d-249f-4e00-8841-b6872c73c486",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1, 'value': 'Y', 'reasoning': 'Y'}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator = StepNecessityEvaluator()\n",
"\n",
"evaluator.evaluate_agent_trajectory(\n",
" prediction=\"The answer is pi\",\n",
" input=\"What is today?\",\n",
" agent_trajectory=[\n",
" (\n",
" AgentAction(tool=\"ask\", tool_input=\"What is today?\", log=\"\"),\n",
" \"tomorrow's yesterday\",\n",
" ),\n",
" (\n",
" AgentAction(tool=\"check_tv\", tool_input=\"Watch tv for half hour\", log=\"\"),\n",
" \"bzzz\",\n",
" ),\n",
" ],\n",
")"
]
},
{
"cell_type": "markdown",
"id": "77353528-723e-4075-939e-aebdb17c1e4f",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,304 +1,305 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6e5ea1a1-7e74-459b-bf14-688f87d09124",
"metadata": {
"tags": []
},
"source": [
"# Agent Trajectory\n",
"\n",
"Agents can be difficult to holistically evaluate due to the breadth of actions and generation they can make. We recommend using multiple evaluation techniques appropriate to your use case. One way to evaluate an agent is to look at the whole trajectory of actions taken along with their responses.\n",
"\n",
"Evaluators that do this can implement the `AgentTrajectoryEvaluator` interface. This walkthrough will show how to use the `trajectory` evaluator to grade an OpenAI functions agent.\n",
"\n",
"For more information, check out the reference docs for the [TrajectoryEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.html#langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain) for more info."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "149402da-5212-43e2-b7c0-a701727f5293",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"trajectory\")"
]
},
{
"cell_type": "markdown",
"id": "b1c64c1a",
"metadata": {},
"source": [
"## Methods\n",
"\n",
"\n",
"The Agent Trajectory Evaluators are used with the [evaluate_agent_trajectory](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.html#langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.evaluate_agent_trajectory) (and async [aevaluate_agent_trajectory](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.html#langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.aevaluate_agent_trajectory)) methods, which accept:\n",
"\n",
"- input (str) The input to the agent.\n",
"- prediction (str) The final predicted response.\n",
"- agent_trajectory (List[Tuple[AgentAction, str]]) The intermediate steps forming the agent trajectory\n",
"\n",
"They return a dictionary with the following values:\n",
"- score: Float from 0 to 1, where 1 would mean \"most effective\" and 0 would mean \"least effective\"\n",
"- reasoning: String \"chain of thought reasoning\" from the LLM generated prior to creating the score"
]
},
{
"cell_type": "markdown",
"id": "e733562c-4c17-4942-9647-acfc5ebfaca2",
"metadata": {},
"source": [
"## Capturing Trajectory\n",
"\n",
"The easiest way to return an agent's trajectory (without using tracing callbacks like those in LangSmith) for evaluation is to initialize the agent with `return_intermediate_steps=True`.\n",
"\n",
"Below, create an example agent we will call to evaluate."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "451cb0cb-6f42-4abd-aa6d-fb871fce034d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.tools import tool\n",
"from langchain.agents import AgentType, initialize_agent\n",
"\n",
"from pydantic import HttpUrl\n",
"from urllib.parse import urlparse\n",
"\n",
"\n",
"@tool\n",
"def ping(url: HttpUrl, return_error: bool) -> str:\n",
" \"\"\"Ping the fully specified url. Must include https:// in the url.\"\"\"\n",
" hostname = urlparse(str(url)).netloc\n",
" completed_process = subprocess.run(\n",
" [\"ping\", \"-c\", \"1\", hostname], capture_output=True, text=True\n",
" )\n",
" output = completed_process.stdout\n",
" if return_error and completed_process.returncode != 0:\n",
" return completed_process.stderr\n",
" return output\n",
"\n",
"\n",
"@tool\n",
"def trace_route(url: HttpUrl, return_error: bool) -> str:\n",
" \"\"\"Trace the route to the specified url. Must include https:// in the url.\"\"\"\n",
" hostname = urlparse(str(url)).netloc\n",
" completed_process = subprocess.run(\n",
" [\"traceroute\", hostname], capture_output=True, text=True\n",
" )\n",
" output = completed_process.stdout\n",
" if return_error and completed_process.returncode != 0:\n",
" return completed_process.stderr\n",
" return output\n",
"\n",
"\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
"agent = initialize_agent(\n",
" llm=llm,\n",
" tools=[ping, trace_route],\n",
" agent=AgentType.OPENAI_MULTI_FUNCTIONS,\n",
" return_intermediate_steps=True, # IMPORTANT!\n",
")\n",
"\n",
"result = agent(\"What's the latency like for https://langchain.com?\")"
]
},
{
"cell_type": "markdown",
"id": "2df34eed-45a5-4f91-88d3-9aa55f28391a",
"metadata": {
"tags": []
},
"source": [
"## Evaluate Trajectory\n",
"\n",
"Pass the input, trajectory, and pass to the [evaluate_agent_trajectory](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.AgentTrajectoryEvaluator.html#langchain.evaluation.schema.AgentTrajectoryEvaluator.evaluate_agent_trajectory) method."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8d2c8703-98ed-4068-8a8b-393f0f1f64ea",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1.0,\n",
" 'reasoning': \"i. The final answer is helpful. It directly answers the user's question about the latency for the website https://langchain.com.\\n\\nii. The AI language model uses a logical sequence of tools to answer the question. It uses the 'ping' tool to measure the latency of the website, which is the correct tool for this task.\\n\\niii. The AI language model uses the tool in a helpful way. It inputs the URL into the 'ping' tool and correctly interprets the output to provide the latency in milliseconds.\\n\\niv. The AI language model does not use too many steps to answer the question. It only uses one step, which is appropriate for this type of question.\\n\\nv. The appropriate tool is used to answer the question. The 'ping' tool is the correct tool to measure website latency.\\n\\nGiven these considerations, the AI language model's performance is excellent. It uses the correct tool, interprets the output correctly, and provides a helpful and direct answer to the user's question.\"}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluation_result = evaluator.evaluate_agent_trajectory(\n",
" prediction=result[\"output\"],\n",
" input=result[\"input\"],\n",
" agent_trajectory=result[\"intermediate_steps\"],\n",
")\n",
"evaluation_result"
]
},
{
"cell_type": "markdown",
"id": "fc5467c1-ea92-405f-949a-3011388fa9ee",
"metadata": {},
"source": [
"## Configuring the Evaluation LLM\n",
"\n",
"If you don't select an LLM to use for evaluation, the [load_evaluator](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.loading.load_evaluator.html#langchain.evaluation.loading.load_evaluator) function will use `gpt-4` to power the evaluation chain. You can select any chat model for the agent trajectory evaluator as below."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1f6318f3-642a-4766-bc7a-f91239795ee7",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# %pip install anthropic\n",
"# ANTHROPIC_API_KEY=<YOUR ANTHROPIC API KEY>"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b2852289-5df9-402e-95b5-7efebf0fc943",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatAnthropic\n",
"\n",
"eval_llm = ChatAnthropic(temperature=0)\n",
"evaluator = load_evaluator(\"trajectory\", llm=eval_llm)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ff72d21a-93b9-4c2f-8613-733d9c9330d7",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1.0,\n",
" 'reasoning': \"Here is my detailed evaluation of the AI's response:\\n\\ni. The final answer is helpful, as it directly provides the latency measurement for the requested website.\\n\\nii. The sequence of using the ping tool to measure latency is logical for this question.\\n\\niii. The ping tool is used in a helpful way, with the website URL provided as input and the output latency measurement extracted.\\n\\niv. Only one step is used, which is appropriate for simply measuring latency. More steps are not needed.\\n\\nv. The ping tool is an appropriate choice to measure latency. \\n\\nIn summary, the AI uses an optimal single step approach with the right tool and extracts the needed output. The final answer directly answers the question in a helpful way.\\n\\nOverall\"}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluation_result = evaluator.evaluate_agent_trajectory(\n",
" prediction=result[\"output\"],\n",
" input=result[\"input\"],\n",
" agent_trajectory=result[\"intermediate_steps\"],\n",
")\n",
"evaluation_result"
]
},
{
"cell_type": "markdown",
"id": "95ce4240-f5a0-4810-8d09-b2f4c9e18b7f",
"metadata": {},
"source": [
"## Providing List of Valid Tools\n",
"\n",
"By default, the evaluator doesn't take into account the tools the agent is permitted to call. You can provide these to the evaluator via the `agent_tools` argument.\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "24c10566-2ef5-45c5-9213-a8fb28e2ca1f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"trajectory\", agent_tools=[ping, trace_route])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7b995786-5b78-4d9e-8e8a-1f2a203113e2",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1.0,\n",
" 'reasoning': \"i. The final answer is helpful. It directly answers the user's question about the latency for the specified website.\\n\\nii. The AI language model uses a logical sequence of tools to answer the question. In this case, only one tool was needed to answer the question, and the model chose the correct one.\\n\\niii. The AI language model uses the tool in a helpful way. The 'ping' tool was used to determine the latency of the website, which was the information the user was seeking.\\n\\niv. The AI language model does not use too many steps to answer the question. Only one step was needed and used.\\n\\nv. The appropriate tool was used to answer the question. The 'ping' tool is designed to measure latency, which was the information the user was seeking.\\n\\nGiven these considerations, the AI language model's performance in answering this question is excellent.\"}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluation_result = evaluator.evaluate_agent_trajectory(\n",
" prediction=result[\"output\"],\n",
" input=result[\"input\"],\n",
" agent_trajectory=result[\"intermediate_steps\"],\n",
")\n",
"evaluation_result"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
"cells": [
{
"cell_type": "markdown",
"id": "6e5ea1a1-7e74-459b-bf14-688f87d09124",
"metadata": {
"tags": []
},
"source": [
"# Agent Trajectory\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/trajectory/trajectory_eval.ipynb)\n",
"\n",
"Agents can be difficult to holistically evaluate due to the breadth of actions and generation they can make. We recommend using multiple evaluation techniques appropriate to your use case. One way to evaluate an agent is to look at the whole trajectory of actions taken along with their responses.\n",
"\n",
"Evaluators that do this can implement the `AgentTrajectoryEvaluator` interface. This walkthrough will show how to use the `trajectory` evaluator to grade an OpenAI functions agent.\n",
"\n",
"For more information, check out the reference docs for the [TrajectoryEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.html#langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain) for more info."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "149402da-5212-43e2-b7c0-a701727f5293",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"trajectory\")"
]
},
{
"cell_type": "markdown",
"id": "b1c64c1a",
"metadata": {},
"source": [
"## Methods\n",
"\n",
"\n",
"The Agent Trajectory Evaluators are used with the [evaluate_agent_trajectory](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.html#langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.evaluate_agent_trajectory) (and async [aevaluate_agent_trajectory](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.html#langchain.evaluation.agents.trajectory_eval_chain.TrajectoryEvalChain.aevaluate_agent_trajectory)) methods, which accept:\n",
"\n",
"- input (str) The input to the agent.\n",
"- prediction (str) The final predicted response.\n",
"- agent_trajectory (List[Tuple[AgentAction, str]]) The intermediate steps forming the agent trajectory\n",
"\n",
"They return a dictionary with the following values:\n",
"- score: Float from 0 to 1, where 1 would mean \"most effective\" and 0 would mean \"least effective\"\n",
"- reasoning: String \"chain of thought reasoning\" from the LLM generated prior to creating the score"
]
},
{
"cell_type": "markdown",
"id": "e733562c-4c17-4942-9647-acfc5ebfaca2",
"metadata": {},
"source": [
"## Capturing Trajectory\n",
"\n",
"The easiest way to return an agent's trajectory (without using tracing callbacks like those in LangSmith) for evaluation is to initialize the agent with `return_intermediate_steps=True`.\n",
"\n",
"Below, create an example agent we will call to evaluate."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "451cb0cb-6f42-4abd-aa6d-fb871fce034d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.tools import tool\n",
"from langchain.agents import AgentType, initialize_agent\n",
"\n",
"from pydantic import HttpUrl\n",
"from urllib.parse import urlparse\n",
"\n",
"\n",
"@tool\n",
"def ping(url: HttpUrl, return_error: bool) -> str:\n",
" \"\"\"Ping the fully specified url. Must include https:// in the url.\"\"\"\n",
" hostname = urlparse(str(url)).netloc\n",
" completed_process = subprocess.run(\n",
" [\"ping\", \"-c\", \"1\", hostname], capture_output=True, text=True\n",
" )\n",
" output = completed_process.stdout\n",
" if return_error and completed_process.returncode != 0:\n",
" return completed_process.stderr\n",
" return output\n",
"\n",
"\n",
"@tool\n",
"def trace_route(url: HttpUrl, return_error: bool) -> str:\n",
" \"\"\"Trace the route to the specified url. Must include https:// in the url.\"\"\"\n",
" hostname = urlparse(str(url)).netloc\n",
" completed_process = subprocess.run(\n",
" [\"traceroute\", hostname], capture_output=True, text=True\n",
" )\n",
" output = completed_process.stdout\n",
" if return_error and completed_process.returncode != 0:\n",
" return completed_process.stderr\n",
" return output\n",
"\n",
"\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
"agent = initialize_agent(\n",
" llm=llm,\n",
" tools=[ping, trace_route],\n",
" agent=AgentType.OPENAI_MULTI_FUNCTIONS,\n",
" return_intermediate_steps=True, # IMPORTANT!\n",
")\n",
"\n",
"result = agent(\"What's the latency like for https://langchain.com?\")"
]
},
{
"cell_type": "markdown",
"id": "2df34eed-45a5-4f91-88d3-9aa55f28391a",
"metadata": {
"tags": []
},
"source": [
"## Evaluate Trajectory\n",
"\n",
"Pass the input, trajectory, and pass to the [evaluate_agent_trajectory](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.AgentTrajectoryEvaluator.html#langchain.evaluation.schema.AgentTrajectoryEvaluator.evaluate_agent_trajectory) method."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8d2c8703-98ed-4068-8a8b-393f0f1f64ea",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1.0,\n",
" 'reasoning': \"i. The final answer is helpful. It directly answers the user's question about the latency for the website https://langchain.com.\\n\\nii. The AI language model uses a logical sequence of tools to answer the question. It uses the 'ping' tool to measure the latency of the website, which is the correct tool for this task.\\n\\niii. The AI language model uses the tool in a helpful way. It inputs the URL into the 'ping' tool and correctly interprets the output to provide the latency in milliseconds.\\n\\niv. The AI language model does not use too many steps to answer the question. It only uses one step, which is appropriate for this type of question.\\n\\nv. The appropriate tool is used to answer the question. The 'ping' tool is the correct tool to measure website latency.\\n\\nGiven these considerations, the AI language model's performance is excellent. It uses the correct tool, interprets the output correctly, and provides a helpful and direct answer to the user's question.\"}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluation_result = evaluator.evaluate_agent_trajectory(\n",
" prediction=result[\"output\"],\n",
" input=result[\"input\"],\n",
" agent_trajectory=result[\"intermediate_steps\"],\n",
")\n",
"evaluation_result"
]
},
{
"cell_type": "markdown",
"id": "fc5467c1-ea92-405f-949a-3011388fa9ee",
"metadata": {},
"source": [
"## Configuring the Evaluation LLM\n",
"\n",
"If you don't select an LLM to use for evaluation, the [load_evaluator](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.loading.load_evaluator.html#langchain.evaluation.loading.load_evaluator) function will use `gpt-4` to power the evaluation chain. You can select any chat model for the agent trajectory evaluator as below."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1f6318f3-642a-4766-bc7a-f91239795ee7",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# %pip install anthropic\n",
"# ANTHROPIC_API_KEY=<YOUR ANTHROPIC API KEY>"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b2852289-5df9-402e-95b5-7efebf0fc943",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.chat_models import ChatAnthropic\n",
"\n",
"eval_llm = ChatAnthropic(temperature=0)\n",
"evaluator = load_evaluator(\"trajectory\", llm=eval_llm)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ff72d21a-93b9-4c2f-8613-733d9c9330d7",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1.0,\n",
" 'reasoning': \"Here is my detailed evaluation of the AI's response:\\n\\ni. The final answer is helpful, as it directly provides the latency measurement for the requested website.\\n\\nii. The sequence of using the ping tool to measure latency is logical for this question.\\n\\niii. The ping tool is used in a helpful way, with the website URL provided as input and the output latency measurement extracted.\\n\\niv. Only one step is used, which is appropriate for simply measuring latency. More steps are not needed.\\n\\nv. The ping tool is an appropriate choice to measure latency. \\n\\nIn summary, the AI uses an optimal single step approach with the right tool and extracts the needed output. The final answer directly answers the question in a helpful way.\\n\\nOverall\"}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluation_result = evaluator.evaluate_agent_trajectory(\n",
" prediction=result[\"output\"],\n",
" input=result[\"input\"],\n",
" agent_trajectory=result[\"intermediate_steps\"],\n",
")\n",
"evaluation_result"
]
},
{
"cell_type": "markdown",
"id": "95ce4240-f5a0-4810-8d09-b2f4c9e18b7f",
"metadata": {},
"source": [
"## Providing List of Valid Tools\n",
"\n",
"By default, the evaluator doesn't take into account the tools the agent is permitted to call. You can provide these to the evaluator via the `agent_tools` argument.\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "24c10566-2ef5-45c5-9213-a8fb28e2ca1f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"trajectory\", agent_tools=[ping, trace_route])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7b995786-5b78-4d9e-8e8a-1f2a203113e2",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'score': 1.0,\n",
" 'reasoning': \"i. The final answer is helpful. It directly answers the user's question about the latency for the specified website.\\n\\nii. The AI language model uses a logical sequence of tools to answer the question. In this case, only one tool was needed to answer the question, and the model chose the correct one.\\n\\niii. The AI language model uses the tool in a helpful way. The 'ping' tool was used to determine the latency of the website, which was the information the user was seeking.\\n\\niv. The AI language model does not use too many steps to answer the question. Only one step was needed and used.\\n\\nv. The appropriate tool was used to answer the question. The 'ping' tool is designed to measure latency, which was the information the user was seeking.\\n\\nGiven these considerations, the AI language model's performance in answering this question is excellent.\"}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluation_result = evaluator.evaluate_agent_trajectory(\n",
" prediction=result[\"output\"],\n",
" input=result[\"input\"],\n",
" agent_trajectory=result[\"intermediate_steps\"],\n",
")\n",
"evaluation_result"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -67,8 +67,10 @@ from langchain.evaluation.embedding_distance import (
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
from langchain.evaluation.exact_match.base import ExactMatchStringEvaluator
from langchain.evaluation.loading import load_dataset, load_evaluator, load_evaluators
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.regex_match.base import RegexMatchStringEvaluator
from langchain.evaluation.schema import (
AgentTrajectoryEvaluator,
EvaluatorType,
@ -83,6 +85,8 @@ from langchain.evaluation.string_distance import (
__all__ = [
"EvaluatorType",
"ExactMatchStringEvaluator",
"RegexMatchStringEvaluator",
"PairwiseStringEvalChain",
"LabeledPairwiseStringEvalChain",
"QAEvalChain",

@ -0,0 +1,97 @@
import string
from typing import Any, List
from langchain.evaluation.schema import StringEvaluator
class ExactMatchStringEvaluator(StringEvaluator):
"""Compute an exact match between the prediction and the reference.
Examples
----------
>>> evaluator = ExactMatchStringEvaluator()
>>> evaluator.evaluate_strings(
prediction="Mindy is the CTO",
reference="Mindy is the CTO",
) # This will return {'score': 1.0}
>>> evaluator.evaluate_strings(
prediction="Mindy is the CTO",
reference="Mindy is the CEO",
) # This will return {'score': 0.0}
"""
def __init__(
self,
*,
ignore_case: bool = False,
ignore_punctuation: bool = False,
ignore_numbers: bool = False,
**kwargs: Any,
):
super().__init__()
self.ignore_case = ignore_case
self.ignore_punctuation = ignore_punctuation
self.ignore_numbers = ignore_numbers
@property
def requires_input(self) -> bool:
"""
This evaluator does not require input.
"""
return False
@property
def requires_reference(self) -> bool:
"""
This evaluator requires a reference.
"""
return True
@property
def input_keys(self) -> List[str]:
"""
Get the input keys.
Returns:
List[str]: The input keys.
"""
return ["reference", "prediction"]
@property
def evaluation_name(self) -> str:
"""
Get the evaluation name.
Returns:
str: The evaluation name.
"""
return "exact_match"
def _evaluate_strings( # type: ignore[arg-type,override]
self,
*,
prediction: str,
reference: str,
**kwargs: Any,
) -> dict:
"""
Evaluate the exact match between the prediction and the reference.
Args:
prediction (str): The prediction string.
reference (Optional[str], optional): The reference string.
Returns:
dict: The evaluation results containing the score.
"""
if self.ignore_case:
prediction = prediction.lower()
reference = reference.lower()
if self.ignore_punctuation:
prediction = prediction.translate(str.maketrans("", "", string.punctuation))
reference = reference.translate(str.maketrans("", "", string.punctuation))
if self.ignore_numbers:
prediction = prediction.translate(str.maketrans("", "", string.digits))
reference = reference.translate(str.maketrans("", "", string.digits))
return {"score": int(prediction == reference)}

@ -14,11 +14,13 @@ from langchain.evaluation.embedding_distance.base import (
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
from langchain.evaluation.exact_match.base import ExactMatchStringEvaluator
from langchain.evaluation.parsing.base import (
JsonEqualityEvaluator,
JsonValidityEvaluator,
)
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.regex_match.base import RegexMatchStringEvaluator
from langchain.evaluation.schema import EvaluatorType, LLMEvalChain, StringEvaluator
from langchain.evaluation.string_distance.base import (
PairwiseStringDistanceEvalChain,
@ -78,6 +80,8 @@ _EVALUATOR_MAP: Dict[
EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: PairwiseEmbeddingDistanceEvalChain,
EvaluatorType.JSON_VALIDITY: JsonValidityEvaluator,
EvaluatorType.JSON_EQUALITY: JsonEqualityEvaluator,
EvaluatorType.REGEX_MATCH: RegexMatchStringEvaluator,
EvaluatorType.EXACT_MATCH: ExactMatchStringEvaluator,
}
@ -111,7 +115,7 @@ def load_evaluator(
if evaluator not in _EVALUATOR_MAP:
raise ValueError(
f"Unknown evaluator type: {evaluator}"
f"Valid types are: {list(_EVALUATOR_MAP.keys())}"
f"\nValid types are: {list(_EVALUATOR_MAP.keys())}"
)
evaluator_cls = _EVALUATOR_MAP[evaluator]
if issubclass(evaluator_cls, LLMEvalChain):
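With the registry entries above, the new evaluators can also be loaded by name, mirroring the `load_evaluator("trajectory")` calls in the notebooks; a sketch, assuming `load_evaluator` forwards extra keyword arguments to the evaluator constructor:

from langchain.evaluation import load_evaluator

exact_evaluator = load_evaluator("exact_match", ignore_case=True)
regex_evaluator = load_evaluator("regex_match")

# Both return a {"score": 0 or 1} dict when comparing a prediction to a reference
exact_evaluator.evaluate_strings(prediction="LangChain", reference="langchain")        # score 1
regex_evaluator.evaluate_strings(prediction="v0.0.301", reference=r"^v\d+\.\d+\.\d+")  # score 1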

@ -0,0 +1,86 @@
import re
from typing import Any, List
from langchain.evaluation.schema import StringEvaluator
class RegexMatchStringEvaluator(StringEvaluator):
"""Compute a regex match between the prediction and the reference.
Examples
----------
>>> evaluator = RegexMatchStringEvaluator(flags=re.IGNORECASE)
>>> evaluator.evaluate_strings(
prediction="Mindy is the CTO",
reference="^mindy.*cto$",
) # This will return {'score': 1.0} due to the IGNORECASE flag
>>> evaluator = RegexMatchStringEvaluator()
>>> evaluator.evaluate_strings(
prediction="Mindy is the CTO",
reference="^Mike.*CEO$",
) # This will return {'score': 0.0}
>>> evaluator.evaluate_strings(
prediction="Mindy is the CTO",
reference="^Mike.*CEO$|^Mindy.*CTO$",
) # This will return {'score': 1.0} as the prediction matches the second pattern in the union
""" # noqa: E501
def __init__(self, *, flags: int = 0, **kwargs: Any): # Default is no flags
super().__init__()
self.flags = flags
@property
def requires_input(self) -> bool:
"""
This evaluator does not require input.
"""
return False
@property
def requires_reference(self) -> bool:
"""
This evaluator requires a reference.
"""
return True
@property
def input_keys(self) -> List[str]:
"""
Get the input keys.
Returns:
List[str]: The input keys.
"""
return ["reference", "prediction"]
@property
def evaluation_name(self) -> str:
"""
Get the evaluation name.
Returns:
str: The evaluation name.
"""
return "regex_match"
def _evaluate_strings( # type: ignore[arg-type,override]
self,
*,
prediction: str,
reference: str,
**kwargs: Any,
) -> dict:
"""
Evaluate the regex match between the prediction and the reference.
Args:
prediction (str): The prediction string.
reference (Optional[str], optional): The reference regex pattern.
Returns:
dict: The evaluation results containing the score.
"""
match = re.match(reference, prediction, flags=self.flags)
return {"score": int(bool(match))}

@ -44,6 +44,10 @@ class EvaluatorType(str, Enum):
custom set of criteria, with a reference label."""
STRING_DISTANCE = "string_distance"
"""Compare predictions to a reference answer using string edit distances."""
EXACT_MATCH = "exact_match"
"""Compare predictions to a reference answer using exact matching."""
REGEX_MATCH = "regex_match"
"""Compare predictions to a reference answer using regular expressions."""
PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
"""Compare predictions based on string edit distances."""
EMBEDDING_DISTANCE = "embedding_distance"

@ -261,4 +261,34 @@ class RunEvalConfig(BaseModel):
evaluator_type: EvaluatorType = EvaluatorType.JSON_EQUALITY
class ExactMatch(EvalConfig):
"""Configuration for an exact match string evaluator.
Parameters
----------
ignore_case : bool
Whether to ignore case when comparing strings.
ignore_punctuation : bool
Whether to ignore punctuation when comparing strings.
ignore_numbers : bool
Whether to ignore numbers when comparing strings.
"""
evaluator_type: EvaluatorType = EvaluatorType.EXACT_MATCH
ignore_case: bool = False
ignore_punctuation: bool = False
ignore_numbers: bool = False
class RegexMatch(EvalConfig):
"""Configuration for a regex match string evaluator.
Parameters
----------
flags : int
The flags to pass to the regex. Example: re.IGNORECASE.
"""
evaluator_type: EvaluatorType = EvaluatorType.REGEX_MATCH
flags: int = 0
# TODO: Trajectory
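A hedged sketch of how these nested configs might be wired into a LangSmith evaluation run; the `langchain.smith` import path, the `evaluation=` keyword, and the dataset name are assumptions rather than part of this diff:

import re
from langchain.smith import RunEvalConfig

eval_config = RunEvalConfig(
    evaluators=[
        RunEvalConfig.ExactMatch(ignore_case=True, ignore_punctuation=True),
        RunEvalConfig.RegexMatch(flags=re.IGNORECASE),
    ]
)
# eval_config would then be passed as evaluation=eval_config to
# run_on_dataset(..., dataset_name="my-dataset", ...)  # hypothetical dataset name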

@ -0,0 +1,49 @@
import pytest
from langchain.evaluation import ExactMatchStringEvaluator
@pytest.fixture
def exact_match_string_evaluator() -> ExactMatchStringEvaluator:
"""Create an ExactMatchStringEvaluator with default configuration."""
return ExactMatchStringEvaluator()
@pytest.fixture
def exact_match_string_evaluator_ignore_case() -> ExactMatchStringEvaluator:
"""Create an ExactMatchStringEvaluator with ignore_case set to True."""
return ExactMatchStringEvaluator(ignore_case=True)
def test_default_exact_matching(
exact_match_string_evaluator: ExactMatchStringEvaluator,
) -> None:
prediction = "Mindy is the CTO"
reference = "Mindy is the CTO"
result = exact_match_string_evaluator.evaluate_strings(
prediction=prediction, reference=reference
)
assert result["score"] == 1.0
reference = "Mindy is the CEO"
result = exact_match_string_evaluator.evaluate_strings(
prediction=prediction, reference=reference
)
assert result["score"] == 0.0
def test_exact_matching_with_ignore_case(
exact_match_string_evaluator_ignore_case: ExactMatchStringEvaluator,
) -> None:
prediction = "Mindy is the CTO"
reference = "mindy is the cto"
result = exact_match_string_evaluator_ignore_case.evaluate_strings(
prediction=prediction, reference=reference
)
assert result["score"] == 1.0
reference = "mindy is the CEO"
result = exact_match_string_evaluator_ignore_case.evaluate_strings(
prediction=prediction, reference=reference
)
assert result["score"] == 0.0

@ -0,0 +1,45 @@
import re
import pytest
from langchain.evaluation import RegexMatchStringEvaluator
@pytest.fixture
def regex_match_string_evaluator() -> RegexMatchStringEvaluator:
"""Create a RegexMatchStringEvaluator with default configuration."""
return RegexMatchStringEvaluator()
@pytest.fixture
def regex_match_string_evaluator_ignore_case() -> RegexMatchStringEvaluator:
"""Create a RegexMatchStringEvaluator with IGNORECASE flag."""
return RegexMatchStringEvaluator(flags=re.IGNORECASE)
def test_default_regex_matching(
regex_match_string_evaluator: RegexMatchStringEvaluator,
) -> None:
prediction = "Mindy is the CTO"
reference = "^Mindy.*CTO$"
result = regex_match_string_evaluator.evaluate_strings(
prediction=prediction, reference=reference
)
assert result["score"] == 1.0
reference = "^Mike.*CEO$"
result = regex_match_string_evaluator.evaluate_strings(
prediction=prediction, reference=reference
)
assert result["score"] == 0.0
def test_regex_matching_with_ignore_case(
regex_match_string_evaluator_ignore_case: RegexMatchStringEvaluator,
) -> None:
prediction = "Mindy is the CTO"
reference = "^mindy.*cto$"
result = regex_match_string_evaluator_ignore_case.evaluate_strings(
prediction=prediction, reference=reference
)
assert result["score"] == 1.0

@ -41,6 +41,7 @@ def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
EvaluatorType.LABELED_PAIRWISE_STRING,
],
[EvaluatorType.JSON_EQUALITY],
[EvaluatorType.EXACT_MATCH, EvaluatorType.REGEX_MATCH],
],
)
def test_eval_chain_requires_references(evaluator_types: List[EvaluatorType]) -> None:
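Both new evaluators report `requires_reference = True`; a hedged sketch of what the parametrized test above presumably asserts (the exact exception type raised by the base class when the reference is missing is an assumption):

import pytest
from langchain.evaluation import ExactMatchStringEvaluator


def test_exact_match_requires_reference() -> None:
    evaluator = ExactMatchStringEvaluator()
    assert evaluator.requires_reference
    # Assumed behavior: the StringEvaluator base class rejects a missing reference
    with pytest.raises(ValueError):
        evaluator.evaluate_strings(prediction="Mindy is the CTO")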
