Mirror of https://github.com/hwchase17/langchain — commit 33da8bd711 (parent e355606b11): "Add Exact match and Regex Match Evaluators (#11132)".
@ -1,318 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "bce7335e-f3b2-44f3-90cc-8c0a23a89a21",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"from langchain.agents import load_tools\n",
|
|
||||||
"from langchain.agents import initialize_agent\n",
|
|
||||||
"from langchain.chat_models import ChatOpenAI\n",
|
|
||||||
"from langchain.utilities import GoogleSearchAPIWrapper\n",
|
|
||||||
"from langchain.schema import (\n",
|
|
||||||
" SystemMessage,\n",
|
|
||||||
" HumanMessage,\n",
|
|
||||||
" AIMessage\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"# os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
|
|
||||||
"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
|
|
||||||
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"******\"\n",
|
|
||||||
"# os.environ[\"LANGCHAIN_PROJECT\"] = \"Jarvis\"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"prefix_messages = [{\"role\": \"system\", \"content\": \"You are a helpful discord Chatbot.\"}]\n",
|
|
||||||
"\n",
|
|
||||||
"llm = ChatOpenAI(model_name='gpt-3.5-turbo', \n",
|
|
||||||
" temperature=0.5, \n",
|
|
||||||
" max_tokens = 2000)\n",
|
|
||||||
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
|
|
||||||
"agent = initialize_agent(tools,\n",
|
|
||||||
" llm,\n",
|
|
||||||
" agent=\"zero-shot-react-description\",\n",
|
|
||||||
" verbose=True,\n",
|
|
||||||
" handle_parsing_errors=True\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"async def on_ready():\n",
|
|
||||||
" print(f'{bot.user} has connected to Discord!')\n",
|
|
||||||
"\n",
|
|
||||||
"async def on_message(message):\n",
|
|
||||||
"\n",
|
|
||||||
" print(\"Detected bot name in message:\", message.content)\n",
|
|
||||||
"\n",
|
|
||||||
" # Capture the output of agent.run() in the response variable\n",
|
|
||||||
" response = agent.run(message.content)\n",
|
|
||||||
"\n",
|
|
||||||
" while response:\n",
|
|
||||||
" print(response)\n",
|
|
||||||
" chunk, response = response[:2000], response[2000:]\n",
|
|
||||||
" print(f\"Chunk: {chunk}\")\n",
|
|
||||||
" print(\"Response sent.\")\n",
|
|
||||||
"\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 22,
|
|
||||||
"id": "1551ce9f-b6de-4035-b6d6-825722823b48",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from dataclasses import dataclass\n",
|
|
||||||
"@dataclass\n",
|
|
||||||
"class Message:\n",
|
|
||||||
" content: str"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 23,
|
|
||||||
"id": "6e6859ec-8544-4407-9663-6b53c0092903",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Detected bot name in message: Hi AI, how are you today?\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
|
||||||
"\u001b[32;1m\u001b[1;3mThis question is not something that can be answered using the available tools.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3m\u001b[0m\n",
|
|
||||||
"\n",
|
|
||||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
|
||||||
"Agent stopped due to iteration limit or time limit.\n",
|
|
||||||
"Chunk: Agent stopped due to iteration limit or time limit.\n",
|
|
||||||
"Response sent.\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"await on_message(Message(content=\"Hi AI, how are you today?\"))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 24,
|
|
||||||
"id": "b850294c-7f8f-4e79-adcf-47e4e3a898df",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langsmith import Client\n",
|
|
||||||
"\n",
|
|
||||||
"client = Client()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 25,
|
|
||||||
"id": "6d089ddc-69bc-45a8-b8db-9962e4f1f5ee",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from itertools import islice\n",
|
|
||||||
"\n",
|
|
||||||
"runs = list(islice(client.list_runs(), 10))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 38,
|
|
||||||
"id": "f0349fac-5a98-400f-ba03-61ed4e1332be",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"runs = sorted(runs, key=lambda x: x.start_time, reverse=True)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 26,
|
|
||||||
"id": "02f133f0-39ee-4b46-b443-12c1f9b76fff",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"ids = [run.id for run in runs]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 39,
|
|
||||||
"id": "3366dce4-0c38-4a7d-8111-046a58b24917",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"runs2 = list(client.list_runs(id=ids))\n",
|
|
||||||
"runs2 = sorted(runs2, key=lambda x: x.start_time, reverse=True)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 42,
|
|
||||||
"id": "82915b90-39a0-47d6-9121-56a13f210f52",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['a36092d2-4ad5-4fb4-9b0d-0dba9a2ed836',\n",
|
|
||||||
" '9398e6be-964f-4aa4-8de9-ad78cd4b7074']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 42,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"[str(x) for x in ids[:2]]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 48,
|
|
||||||
"id": "f610ec91-dc48-4a17-91c5-5c4675c77abc",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langsmith.run_helpers import traceable\n",
|
|
||||||
"\n",
|
|
||||||
"@traceable(run_type=\"llm\", name=\"\"\"<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/dQw4w9WgXcQ?start=5\" title=\"YouTube video player\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" allowfullscreen></iframe>\"\"\")\n",
|
|
||||||
"def foo():\n",
|
|
||||||
" return \"bar\""
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 49,
|
|
||||||
"id": "bd317bd7-8b2a-433a-8ec3-098a84ba8e64",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"'bar'"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 49,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"foo()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 52,
|
|
||||||
"id": "b142519b-6885-415c-83b9-4a346fb90589",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain.llms import AzureOpenAI"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "5c50bb2b-72b8-4322-9b16-d857ecd9f347",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.11.2"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
@ -0,0 +1,175 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "2da95378",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Exact Match\n",
|
||||||
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/exact_match.ipynb)\n",
|
||||||
|
"\n",
|
||||||
|
"Probably the simplest ways to evaluate an LLM or runnable's string output against a reference label is by a simple string equivalence.\n",
|
||||||
|
"\n",
|
||||||
|
"This can be accessed using the `exact_match` evaluator."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "0de44d01-1fea-4701-b941-c4fb74e521e7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.evaluation import ExactMatchStringEvaluator\n",
|
||||||
|
"\n",
|
||||||
|
"evaluator = ExactMatchStringEvaluator()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "fe3baf5f-bfee-4745-bcd6-1a9b422ed46f",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Alternatively via the loader:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "f6790c46",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.evaluation import load_evaluator\n",
|
||||||
|
"\n",
|
||||||
|
"evaluator = load_evaluator(\"exact_match\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "49ad9139",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 0}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"1 LLM.\",\n",
|
||||||
|
" reference=\"2 llm\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "1f5e82a3-247e-45a8-85fc-6af53bf7ff82",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 0}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"LangChain\",\n",
|
||||||
|
" reference=\"langchain\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Configure the ExactMatchStringEvaluator\n",
|
||||||
|
"\n",
|
||||||
|
"You can relax the \"exactness\" when comparing strings."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"evaluator = ExactMatchStringEvaluator(\n",
|
||||||
|
" ignore_case=True,\n",
|
||||||
|
" ignore_numbers=True,\n",
|
||||||
|
" ignore_punctuation=True,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Alternatively\n",
|
||||||
|
"# evaluator = load_evaluator(\"exact_match\", ignore_case=True, ignore_numbers=True, ignore_punctuation=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"1 LLM.\",\n",
|
||||||
|
" reference=\"2 llm\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,243 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "2da95378",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Regex Match\n",
|
||||||
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/regex_match.ipynb)\n",
|
||||||
|
"\n",
|
||||||
|
"To evaluate chain or runnable string predictions against a custom regex, you can use the `regex_match` evaluator."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "0de44d01-1fea-4701-b941-c4fb74e521e7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.evaluation import RegexMatchStringEvaluator\n",
|
||||||
|
"\n",
|
||||||
|
"evaluator = RegexMatchStringEvaluator()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "fe3baf5f-bfee-4745-bcd6-1a9b422ed46f",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Alternatively via the loader:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "f6790c46",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.evaluation import load_evaluator\n",
|
||||||
|
"\n",
|
||||||
|
"evaluator = load_evaluator(\"regex_match\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "49ad9139",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Check for the presence of a YYYY-MM-DD string.\n",
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"The delivery will be made on 2024-01-05\",\n",
|
||||||
|
" reference=\".*\\\\b\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\b.*\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "1f5e82a3-247e-45a8-85fc-6af53bf7ff82",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 0}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Check for the presence of a MM-DD-YYYY string.\n",
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"The delivery will be made on 2024-01-05\",\n",
|
||||||
|
" reference=\".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "168fcd92-dffb-4345-b097-02d0fedf52fd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Check for the presence of a MM-DD-YYYY string.\n",
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"The delivery will be made on 01-05-2024\",\n",
|
||||||
|
" reference=\".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1d82dab5-6a49-4fe7-b3fb-8bcfb27d26e0",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Match against multiple patterns\n",
|
||||||
|
"\n",
|
||||||
|
"To match against multiple patterns, use a regex union \"|\"."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "b87b915e-b7c2-476b-a452-99688a22293a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Check for the presence of a MM-DD-YYYY string or YYYY-MM-DD\n",
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"The delivery will be made on 01-05-2024\",\n",
|
||||||
|
" reference=\"|\".join([\".*\\\\b\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\b.*\", \".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"])\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Configure the RegexMatchStringEvaluator\n",
|
||||||
|
"\n",
|
||||||
|
"You can specify any regex flags to use when matching."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"\n",
|
||||||
|
"evaluator = RegexMatchStringEvaluator(\n",
|
||||||
|
" flags=re.IGNORECASE\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Alternatively\n",
|
||||||
|
"# evaluator = load_evaluator(\"exact_match\", flags=re.IGNORECASE)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"I LOVE testing\",\n",
|
||||||
|
" reference=\"I love testing\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "82de8d3e-c829-440e-a582-3fb70cecad3b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,97 @@
|
|||||||
|
import string
|
||||||
|
from typing import Any, List
|
||||||
|
|
||||||
|
from langchain.evaluation.schema import StringEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
class ExactMatchStringEvaluator(StringEvaluator):
    """Compute an exact match between the prediction and the reference.

    The comparison can optionally be relaxed by lower-casing both strings
    and/or stripping punctuation and digits before comparing.

    Examples
    ----------
    >>> evaluator = ExactMatchStringEvaluator()
    >>> evaluator.evaluate_strings(
            prediction="Mindy is the CTO",
            reference="Mindy is the CTO",
        )  # This will return {'score': 1}

    >>> evaluator.evaluate_strings(
            prediction="Mindy is the CTO",
            reference="Mindy is the CEO",
        )  # This will return {'score': 0}
    """

    def __init__(
        self,
        *,
        ignore_case: bool = False,
        ignore_punctuation: bool = False,
        ignore_numbers: bool = False,
        **kwargs: Any,
    ):
        """Initialize the evaluator.

        Args:
            ignore_case: If True, compare case-insensitively.
            ignore_punctuation: If True, strip ``string.punctuation`` chars
                from both strings before comparing.
            ignore_numbers: If True, strip ``string.digits`` chars from both
                strings before comparing.
            **kwargs: Ignored; accepted for loader compatibility.
        """
        super().__init__()
        self.ignore_case = ignore_case
        self.ignore_punctuation = ignore_punctuation
        self.ignore_numbers = ignore_numbers

    @property
    def requires_input(self) -> bool:
        """
        This evaluator does not require input.
        """
        return False

    @property
    def requires_reference(self) -> bool:
        """
        This evaluator requires a reference.
        """
        return True

    @property
    def input_keys(self) -> List[str]:
        """
        Get the input keys.

        Returns:
            List[str]: The input keys.
        """
        return ["reference", "prediction"]

    @property
    def evaluation_name(self) -> str:
        """
        Get the evaluation name.

        Returns:
            str: The evaluation name.
        """
        return "exact_match"

    def _normalize(self, text: str) -> str:
        """Apply the configured relaxations (case/punctuation/digit removal)."""
        if self.ignore_case:
            text = text.lower()
        if self.ignore_punctuation:
            text = text.translate(str.maketrans("", "", string.punctuation))
        if self.ignore_numbers:
            text = text.translate(str.maketrans("", "", string.digits))
        return text

    def _evaluate_strings(  # type: ignore[arg-type,override]
        self,
        *,
        prediction: str,
        reference: str,
        **kwargs: Any,
    ) -> dict:
        """
        Evaluate the exact match between the prediction and the reference.

        Args:
            prediction (str): The prediction string.
            reference (str): The reference string.

        Returns:
            dict: The evaluation results containing the score (1 on match,
                0 otherwise, after any configured normalization).
        """
        # Normalize both sides identically so the comparison is symmetric.
        return {
            "score": int(self._normalize(prediction) == self._normalize(reference))
        }
|
@ -0,0 +1,86 @@
|
|||||||
|
import re
|
||||||
|
from typing import Any, List
|
||||||
|
|
||||||
|
from langchain.evaluation.schema import StringEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
class RegexMatchStringEvaluator(StringEvaluator):
    """Compute a regex match between the prediction and the reference.

    The reference is interpreted as a regular expression and matched against
    the start of the prediction (``re.match`` semantics).

    Examples
    ----------
    >>> evaluator = RegexMatchStringEvaluator(flags=re.IGNORECASE)
    >>> evaluator.evaluate_strings(
            prediction="Mindy is the CTO",
            reference="^mindy.*cto$",
        )  # This will return {'score': 1.0} due to the IGNORECASE flag

    >>> evaluator = RegexMatchStringEvaluator()
    >>> evaluator.evaluate_strings(
            prediction="Mindy is the CTO",
            reference="^Mike.*CEO$",
        )  # This will return {'score': 0.0}

    >>> evaluator.evaluate_strings(
            prediction="Mindy is the CTO",
            reference="^Mike.*CEO$|^Mindy.*CTO$",
        )  # This will return {'score': 1.0} as the prediction matches the second pattern in the union
    """  # noqa: E501

    def __init__(self, *, flags: int = 0, **kwargs: Any):  # Default is no flags
        super().__init__()
        # Regex flags (e.g. re.IGNORECASE) applied to every match.
        self.flags = flags

    @property
    def requires_input(self) -> bool:
        """
        This evaluator does not require input.
        """
        return False

    @property
    def requires_reference(self) -> bool:
        """
        This evaluator requires a reference.
        """
        return True

    @property
    def input_keys(self) -> List[str]:
        """
        Get the input keys.

        Returns:
            List[str]: The input keys.
        """
        return ["reference", "prediction"]

    @property
    def evaluation_name(self) -> str:
        """
        Get the evaluation name.

        Returns:
            str: The evaluation name.
        """
        return "regex_match"

    def _evaluate_strings(  # type: ignore[arg-type,override]
        self,
        *,
        prediction: str,
        reference: str,
        **kwargs: Any,
    ) -> dict:
        """
        Evaluate the regex match between the prediction and the reference.

        Args:
            prediction (str): The prediction string.
            reference (Optional[str], optional): The reference regex pattern.

        Returns:
            dict: The evaluation results containing the score (1 when the
                pattern matches at the start of the prediction, else 0).
        """
        pattern = re.compile(reference, self.flags)
        # re.match anchors at the beginning of the prediction string.
        found = pattern.match(prediction) is not None
        return {"score": int(found)}
|
@ -0,0 +1,49 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain.evaluation import ExactMatchStringEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def exact_match_string_evaluator() -> ExactMatchStringEvaluator:
    """Provide an ExactMatchStringEvaluator built with its defaults."""
    evaluator = ExactMatchStringEvaluator()
    return evaluator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def exact_match_string_evaluator_ignore_case() -> ExactMatchStringEvaluator:
    """Provide an ExactMatchStringEvaluator that compares case-insensitively."""
    evaluator = ExactMatchStringEvaluator(ignore_case=True)
    return evaluator
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_exact_matching(
    exact_match_string_evaluator: ExactMatchStringEvaluator,
) -> None:
    """Identical strings score 1.0; any difference scores 0.0."""
    prediction = "Mindy is the CTO"
    cases = [
        ("Mindy is the CTO", 1.0),
        ("Mindy is the CEO", 0.0),
    ]
    for reference, expected in cases:
        result = exact_match_string_evaluator.evaluate_strings(
            prediction=prediction, reference=reference
        )
        assert result["score"] == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_exact_matching_with_ignore_case(
    exact_match_string_evaluator_ignore_case: ExactMatchStringEvaluator,
) -> None:
    """Case differences are ignored, but other differences still fail."""
    prediction = "Mindy is the CTO"
    cases = [
        ("mindy is the cto", 1.0),
        ("mindy is the CEO", 0.0),
    ]
    for reference, expected in cases:
        result = exact_match_string_evaluator_ignore_case.evaluate_strings(
            prediction=prediction, reference=reference
        )
        assert result["score"] == expected
|
@ -0,0 +1,45 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain.evaluation import RegexMatchStringEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def regex_match_string_evaluator() -> RegexMatchStringEvaluator:
    """Provide a RegexMatchStringEvaluator built with its defaults (no flags)."""
    evaluator = RegexMatchStringEvaluator()
    return evaluator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def regex_match_string_evaluator_ignore_case() -> RegexMatchStringEvaluator:
    """Provide a RegexMatchStringEvaluator that matches case-insensitively."""
    evaluator = RegexMatchStringEvaluator(flags=re.IGNORECASE)
    return evaluator
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_regex_matching(
    regex_match_string_evaluator: RegexMatchStringEvaluator,
) -> None:
    """A matching pattern scores 1.0; a non-matching one scores 0.0."""
    prediction = "Mindy is the CTO"
    cases = [
        ("^Mindy.*CTO$", 1.0),
        ("^Mike.*CEO$", 0.0),
    ]
    for reference, expected in cases:
        result = regex_match_string_evaluator.evaluate_strings(
            prediction=prediction, reference=reference
        )
        assert result["score"] == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_regex_matching_with_ignore_case(
    regex_match_string_evaluator_ignore_case: RegexMatchStringEvaluator,
) -> None:
    """With re.IGNORECASE, a lowercase pattern matches mixed-case text."""
    result = regex_match_string_evaluator_ignore_case.evaluate_strings(
        prediction="Mindy is the CTO",
        reference="^mindy.*cto$",
    )
    assert result["score"] == 1.0
|
Loading…
Reference in New Issue