Docs Nits (#7874)

Add links to reference docs
1 year ago · c6f2d27789
parent 3179ee3a56
commit c6f2d27789
8 changed files with 66 additions and 791 deletions
--- a/docs/docs_skeleton/docs/guides/evaluation/llm_math.ipynb
+++ b/docs/docs_skeleton/docs/guides/evaluation/llm_math.ipynb
@ -1,308 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "a4734146",
-   "metadata": {},
-   "source": [
-    "# LLM Math\n",
-    "\n",
-    "Evaluating chains that know how to do math."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "fdd7afae",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Comment this out if you are NOT using tracing\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"LANGCHAIN_HANDLER\"] = \"langchain\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "ce05ffea",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d028a511cede4de2b845b9a9954d6bea",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Downloading and preparing dataset json/LangChainDatasets--llm-math to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--llm-math-509b11d101165afa/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a71c8e5a21dd4da5a20a354b544f7a58",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "ae530ca624154a1a934075c47d1093a6",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Downloading data:   0%|          | 0.00/631 [00:00<?, ?B/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "7a4968df05d84bc483aa2c5039aecafe",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating train split: 0 examples [00:00, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dataset json downloaded and prepared to /Users/harrisonchase/.cache/huggingface/datasets/LangChainDatasets___json/LangChainDatasets--llm-math-509b11d101165afa/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9a2caed96225410fb1cc0f8f155eb766",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from langchain.evaluation.loading import load_dataset\n",
-    "\n",
-    "dataset = load_dataset(\"llm-math\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8a998d6f",
-   "metadata": {},
-   "source": [
-    "## Setting up a chain\n",
-    "Now we need to create some pipelines for doing math."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "7078f7f8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.llms import OpenAI\n",
-    "from langchain.chains import LLMMathChain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "2bd70c46",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "llm = OpenAI()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "954c3270",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chain = LLMMathChain(llm=llm)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "f252027e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "predictions = chain.apply(dataset)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "c8af7041",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "numeric_output = [float(p[\"answer\"].strip().strip(\"Answer: \")) for p in predictions]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "id": "cc09ffe4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "correct = [example[\"answer\"] == numeric_output[i] for i, example in enumerate(dataset)]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "585244e4",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1.0"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sum(correct) / len(correct)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "0d14ac78",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "input:  5\n",
-      "expected output : 5.0\n",
-      "prediction:  5.0\n",
-      "input:  5 + 3\n",
-      "expected output : 8.0\n",
-      "prediction:  8.0\n",
-      "input:  2^3.171\n",
-      "expected output : 9.006708689094099\n",
-      "prediction:  9.006708689094099\n",
-      "input:    2 ^3.171 \n",
-      "expected output : 9.006708689094099\n",
-      "prediction:  9.006708689094099\n",
-      "input:  two to the power of three point one hundred seventy one\n",
-      "expected output : 9.006708689094099\n",
-      "prediction:  9.006708689094099\n",
-      "input:  five + three squared minus 1\n",
-      "expected output : 13.0\n",
-      "prediction:  13.0\n",
-      "input:  2097 times 27.31\n",
-      "expected output : 57269.07\n",
-      "prediction:  57269.07\n",
-      "input:  two thousand ninety seven times twenty seven point thirty one\n",
-      "expected output : 57269.07\n",
-      "prediction:  57269.07\n",
-      "input:  209758 / 2714\n",
-      "expected output : 77.28739867354459\n",
-      "prediction:  77.28739867354459\n",
-      "input:  209758.857 divided by 2714.31\n",
-      "expected output : 77.27888745205964\n",
-      "prediction:  77.27888745205964\n"
-     ]
-    }
-   ],
-   "source": [
-    "for i, example in enumerate(dataset):\n",
-    "    print(\"input: \", example[\"question\"])\n",
-    "    print(\"expected output :\", example[\"answer\"])\n",
-    "    print(\"prediction: \", numeric_output[i])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b9021ffd",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/docs/extras/guides/evaluation/generic_agent_evaluation.ipynb
+++ b/docs/extras/guides/evaluation/generic_agent_evaluation.ipynb
@ -1,436 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Evaluating Agent Trajectories\n",
-    "\n",
-    "Good evaluation is key for quickly iterating on your agent's prompts and tools. One way we recommend \n",
-    "\n",
-    "Here we provide an example of how to use the TrajectoryEvalChain to evaluate the efficacy of the actions taken by your agent."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Setup\n",
-    "\n",
-    "Let's start by defining our agent."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain import Wikipedia\n",
-    "from langchain.chat_models import ChatOpenAI\n",
-    "from langchain.agents import initialize_agent, Tool\n",
-    "from langchain.agents import AgentType\n",
-    "from langchain.agents.react.base import DocstoreExplorer\n",
-    "from langchain.memory import ConversationBufferMemory\n",
-    "from langchain import LLMMathChain\n",
-    "from langchain.llms import OpenAI\n",
-    "\n",
-    "from langchain import SerpAPIWrapper\n",
-    "\n",
-    "docstore = DocstoreExplorer(Wikipedia())\n",
-    "\n",
-    "math_llm = OpenAI(temperature=0)\n",
-    "\n",
-    "llm_math_chain = LLMMathChain.from_llm(llm=math_llm, verbose=True)\n",
-    "\n",
-    "search = SerpAPIWrapper()\n",
-    "\n",
-    "tools = [\n",
-    "    Tool(\n",
-    "        name=\"Search\",\n",
-    "        func=docstore.search,\n",
-    "        description=\"useful for when you need to ask with search. Must call before lookup.\",\n",
-    "    ),\n",
-    "    Tool(\n",
-    "        name=\"Lookup\",\n",
-    "        func=docstore.lookup,\n",
-    "        description=\"useful for when you need to ask with lookup. Only call after a successfull 'Search'.\",\n",
-    "    ),\n",
-    "    Tool(\n",
-    "        name=\"Calculator\",\n",
-    "        func=llm_math_chain.run,\n",
-    "        description=\"useful for arithmetic. Expects strict numeric input, no words.\",\n",
-    "    ),\n",
-    "    Tool(\n",
-    "        name=\"Search-the-Web-SerpAPI\",\n",
-    "        func=search.run,\n",
-    "        description=\"useful for when you need to answer questions about current events\",\n",
-    "    ),\n",
-    "]\n",
-    "\n",
-    "memory = ConversationBufferMemory(\n",
-    "    memory_key=\"chat_history\", return_messages=True, output_key=\"output\"\n",
-    ")\n",
-    "\n",
-    "llm = ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo-0613\")\n",
-    "\n",
-    "agent = initialize_agent(\n",
-    "    tools,\n",
-    "    llm,\n",
-    "    agent=AgentType.OPENAI_FUNCTIONS,\n",
-    "    verbose=True,\n",
-    "    memory=memory,\n",
-    "    return_intermediate_steps=True,  # This is needed for the evaluation later\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Test the Agent\n",
-    "\n",
-    "Now let's try our agent out on some example queries."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "\u001b[1m> Entering new  chain...\u001b[0m\n",
-      "\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `Calculator` with `1040000 / (4/100)^3 / 1000000`\n",
-      "responded: {content}\n",
-      "\n",
-      "\u001b[0m\n",
-      "\n",
-      "\u001b[1m> Entering new  chain...\u001b[0m\n",
-      "1040000 / (4/100)^3 / 1000000\u001b[32;1m\u001b[1;3m```text\n",
-      "1040000 / (4/100)**3 / 1000000\n",
-      "```\n",
-      "...numexpr.evaluate(\"1040000 / (4/100)**3 / 1000000\")...\n",
-      "\u001b[0m\n",
-      "Answer: \u001b[33;1m\u001b[1;3m16249.999999999998\u001b[0m\n",
-      "\u001b[1m> Finished chain.\u001b[0m\n",
-      "\u001b[38;5;200m\u001b[1;3mAnswer: 16249.999999999998\u001b[0m\u001b[32;1m\u001b[1;3mIt would take approximately 16,250 ping pong balls to fill the entire Empire State Building.\u001b[0m\n",
-      "\n",
-      "\u001b[1m> Finished chain.\u001b[0m\n"
-     ]
-    }
-   ],
-   "source": [
-    "query_one = (\n",
-    "    \"How many ping pong balls would it take to fill the entire Empire State Building?\"\n",
-    ")\n",
-    "\n",
-    "test_outputs_one = agent({\"input\": query_one}, return_only_outputs=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This looks alright.. Let's try it out on another query."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "\n",
-      "\u001b[1m> Entering new  chain...\u001b[0m\n",
-      "\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `Search` with `length of the US from coast to coast`\n",
-      "\n",
-      "\n",
-      "\u001b[0m\u001b[36;1m\u001b[1;3m\n",
-      "== Watercraft ==\u001b[0m\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `Search` with `distance from coast to coast of the US`\n",
-      "\n",
-      "\n",
-      "\u001b[0m\u001b[36;1m\u001b[1;3mThe Oregon Coast is a coastal region of the U.S. state of Oregon. It is bordered by the Pacific Ocean to its west and the Oregon Coast Range to the east, and stretches approximately 362 miles (583 km) from the California state border in the south to the Columbia River in the north. The region is not a specific geological, environmental, or political entity, and includes the Columbia River Estuary.\n",
-      "The Oregon Beach Bill of 1967 allows free beach access to everyone.  In return for a pedestrian easement and relief from construction, the bill eliminates property taxes on private beach land and allows its owners to retain certain beach land rights.Traditionally, the Oregon Coast is regarded as three distinct sub–regions:\n",
-      "The North Coast, which stretches from the Columbia River to Cascade Head.\n",
-      "The Central Coast, which stretches from Cascade Head to Reedsport.\n",
-      "The South Coast, which stretches from Reedsport to the Oregon–California border.The largest city is Coos Bay, population 16,700 in Coos County on the South Coast. U.S. Route 101 is the primary highway from Brookings to Astoria and is known for its scenic overlooks of the Pacific Ocean. Over 80 state parks and recreation areas dot the Oregon Coast. However, only a few highways cross the Coast Range to the interior: US 30, US 26, OR 6, US 20, OR 18, OR 34, OR 126, OR 38, and OR 42.  OR 18 and US 20 are considered among the dangerous roads in the state.The Oregon Coast includes Clatsop County, Tillamook County, Lincoln County, western Lane County, western Douglas County, Coos County, and Curry County.\u001b[0m\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `Calculator` with `362 miles * 5280 feet`\n",
-      "\n",
-      "\n",
-      "\u001b[0m\n",
-      "\n",
-      "\u001b[1m> Entering new  chain...\u001b[0m\n",
-      "362 miles * 5280 feet\u001b[32;1m\u001b[1;3m```text\n",
-      "362 * 5280\n",
-      "```\n",
-      "...numexpr.evaluate(\"362 * 5280\")...\n",
-      "\u001b[0m\n",
-      "Answer: \u001b[33;1m\u001b[1;3m1911360\u001b[0m\n",
-      "\u001b[1m> Finished chain.\u001b[0m\n",
-      "\u001b[38;5;200m\u001b[1;3mAnswer: 1911360\u001b[0m\u001b[32;1m\u001b[1;3m\n",
-      "Invoking: `Calculator` with `1911360 feet / 1063 feet`\n",
-      "\n",
-      "\n",
-      "\u001b[0m\n",
-      "\n",
-      "\u001b[1m> Entering new  chain...\u001b[0m\n",
-      "1911360 feet / 1063 feet\u001b[32;1m\u001b[1;3m```text\n",
-      "1911360 / 1063\n",
-      "```\n",
-      "...numexpr.evaluate(\"1911360 / 1063\")...\n",
-      "\u001b[0m\n",
-      "Answer: \u001b[33;1m\u001b[1;3m1798.0809031044214\u001b[0m\n",
-      "\u001b[1m> Finished chain.\u001b[0m\n",
-      "\u001b[38;5;200m\u001b[1;3mAnswer: 1798.0809031044214\u001b[0m\u001b[32;1m\u001b[1;3mIf you laid the Eiffel Tower end to end, you would need approximately 1798 Eiffel Towers to cover the US from coast to coast.\u001b[0m\n",
-      "\n",
-      "\u001b[1m> Finished chain.\u001b[0m\n"
-     ]
-    }
-   ],
-   "source": [
-    "query_two = \"If you laid the Eiffel Tower end to end, how many would you need cover the US from coast to coast?\"\n",
-    "\n",
-    "test_outputs_two = agent({\"input\": query_two}, return_only_outputs=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This doesn't look so good. Let's try running some evaluation.\n",
-    "\n",
-    "## Evaluating the Agent\n",
-    "\n",
-    "Let's start by defining the TrajectoryEvalChain."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.evaluation.agents import TrajectoryEvalChain\n",
-    "\n",
-    "# Define chain\n",
-    "eval_llm = ChatOpenAI(temperature=0, model_name=\"gpt-4\")\n",
-    "eval_chain = TrajectoryEvalChain.from_llm(\n",
-    "    llm=eval_llm,  # Note: This must be a chat model\n",
-    "    agent_tools=agent.tools,\n",
-    "    return_reasoning=True,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's try evaluating the first query."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Score from 1 to 5:  1\n",
-      "Reasoning:  i. Is the final answer helpful?\n",
-      "The final answer is not helpful because it is incorrect. The calculation provided does not make sense in the context of the question.\n",
-      "\n",
-      "ii. Does the AI language use a logical sequence of tools to answer the question?\n",
-      "The AI language model does not use a logical sequence of tools. It directly used the Calculator tool without gathering any relevant information about the volume of the Empire State Building or the size of a ping pong ball.\n",
-      "\n",
-      "iii. Does the AI language model use the tools in a helpful way?\n",
-      "The AI language model does not use the tools in a helpful way. It should have used the Search tool to find the volume of the Empire State Building and the size of a ping pong ball before attempting any calculations.\n",
-      "\n",
-      "iv. Does the AI language model use too many steps to answer the question?\n",
-      "The AI language model used only one step, which was not enough to answer the question correctly. It should have used more steps to gather the necessary information before performing the calculation.\n",
-      "\n",
-      "v. Are the appropriate tools used to answer the question?\n",
-      "The appropriate tools were not used to answer the question. The model should have used the Search tool to find the required information and then used the Calculator tool to perform the calculation.\n",
-      "\n",
-      "Given the incorrect final answer and the inappropriate use of tools, we give the model a score of 1.\n"
-     ]
-    }
-   ],
-   "source": [
-    "question, steps, answer = (\n",
-    "    test_outputs_one[\"input\"],\n",
-    "    test_outputs_one[\"intermediate_steps\"],\n",
-    "    test_outputs_one[\"output\"],\n",
-    ")\n",
-    "\n",
-    "evaluation = eval_chain.evaluate_agent_trajectory(\n",
-    "    input=test_outputs_one[\"input\"],\n",
-    "    output=test_outputs_one[\"output\"],\n",
-    "    agent_trajectory=test_outputs_one[\"intermediate_steps\"],\n",
-    ")\n",
-    "\n",
-    "print(\"Score from 1 to 5: \", evaluation[\"score\"])\n",
-    "print(\"Reasoning: \", evaluation[\"reasoning\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**That seems about right. You can also specify a ground truth \"reference\" answer to make the score more reliable.**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Score from 1 to 5:  1\n",
-      "Reasoning:  i. Is the final answer helpful?\n",
-      "The final answer is not helpful, as it is incorrect. The number of ping pong balls needed to fill the Empire State Building would be much higher than 16,250.\n",
-      "\n",
-      "ii. Does the AI language use a logical sequence of tools to answer the question?\n",
-      "The AI language model does not use a logical sequence of tools. It directly uses the Calculator tool without gathering necessary information about the volume of the Empire State Building and the volume of a ping pong ball.\n",
-      "\n",
-      "iii. Does the AI language model use the tools in a helpful way?\n",
-      "The AI language model does not use the tools in a helpful way. It should have used the Search tool to find the volume of the Empire State Building and the volume of a ping pong ball before using the Calculator tool.\n",
-      "\n",
-      "iv. Does the AI language model use too many steps to answer the question?\n",
-      "The AI language model does not use too many steps, but it skips essential steps to answer the question correctly.\n",
-      "\n",
-      "v. Are the appropriate tools used to answer the question?\n",
-      "The appropriate tools are not used to answer the question. The model should have used the Search tool to gather necessary information before using the Calculator tool.\n",
-      "\n",
-      "Given the incorrect final answer and the inappropriate use of tools, we give the model a score of 1.\n"
-     ]
-    }
-   ],
-   "source": [
-    "evaluation = eval_chain.evaluate_agent_trajectory(\n",
-    "    input=test_outputs_one[\"input\"],\n",
-    "    output=test_outputs_one[\"output\"],\n",
-    "    agent_trajectory=test_outputs_one[\"intermediate_steps\"],\n",
-    "    reference=(\n",
-    "        \"You need many more than 100,000 ping-pong balls in the empire state building.\"\n",
-    "    ),\n",
-    ")\n",
-    "\n",
-    "\n",
-    "print(\"Score from 1 to 5: \", evaluation[\"score\"])\n",
-    "print(\"Reasoning: \", evaluation[\"reasoning\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "**Let's try the second query. This time, use the async API. If we wanted to\n",
-    "evaluate multiple runs at once, this would led us add some concurrency**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Score from 1 to 5:  2\n",
-      "Reasoning:  i. Is the final answer helpful?\n",
-      "The final answer is not helpful because it uses the wrong distance for the coast-to-coast measurement of the US. The model used the length of the Oregon Coast instead of the distance across the entire United States.\n",
-      "\n",
-      "ii. Does the AI language use a logical sequence of tools to answer the question?\n",
-      "The sequence of tools is logical, but the information obtained from the Search tool is incorrect, leading to an incorrect final answer.\n",
-      "\n",
-      "iii. Does the AI language model use the tools in a helpful way?\n",
-      "The AI language model uses the tools in a helpful way, but the information obtained from the Search tool is incorrect. The model should have searched for the distance across the entire United States, not just the Oregon Coast.\n",
-      "\n",
-      "iv. Does the AI language model use too many steps to answer the question?\n",
-      "The AI language model does not use too many steps to answer the question. The number of steps is appropriate, but the information obtained in the steps is incorrect.\n",
-      "\n",
-      "v. Are the appropriate tools used to answer the question?\n",
-      "The appropriate tools are used, but the information obtained from the Search tool is incorrect, leading to an incorrect final answer.\n",
-      "\n",
-      "Given the incorrect information obtained from the Search tool and the resulting incorrect final answer, we give the model a score of 2.\n"
-     ]
-    }
-   ],
-   "source": [
-    "evaluation = await eval_chain.aevaluate_agent_trajectory(\n",
-    "    input=test_outputs_two[\"input\"],\n",
-    "    output=test_outputs_two[\"output\"],\n",
-    "    agent_trajectory=test_outputs_two[\"intermediate_steps\"],\n",
-    ")\n",
-    "\n",
-    "print(\"Score from 1 to 5: \", evaluation[\"score\"])\n",
-    "print(\"Reasoning: \", evaluation[\"reasoning\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Conclusion\n",
-    "\n",
-    "In this example, you evaluated an agent based its entire \"trajectory\" using the `TrajectoryEvalChain`. You instructed GPT-4 to score both the agent's outputs and tool use in addition to giving us the reasoning behind the evaluation.\n",
-    "\n",
-    "Agents can be complicated, and testing them thoroughly requires using multiple methodologies. Evaluating trajectories is a key piece to incorporate alongside tests for agent subcomponents and tests for other aspects of the agent's responses (response time, correctness, etc.) "
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
-  },
-  "vscode": {
-   "interpreter": {
-    "hash": "06ba49dd587e86cdcfee66b9ffe769e1e94f0e368e54c2d6c866e38e33c0d9b1"
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
--- a/docs/extras/modules/evaluation/string/criteria_eval_chain.ipynb
+++ b/docs/extras/modules/evaluation/string/criteria_eval_chain.ipynb
@ -12,7 +12,7 @@
    "The `criteria` evaluator is a convenient way to predict whether an LLM or Chain's output complies with a set of criteria, so long as you can\n",
    "properly define those criteria.\n",
    "\n",
-    "For more details, check out the reference docs for the [CriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain) on the class definition\n",
+    "For more details, check out the reference docs for the [CriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain)'s class definition.\n",
    "\n",
    "### Without References\n",
    "\n",
--- a/docs/extras/modules/evaluation/string/embedding_distance.ipynb
+++ b/docs/extras/modules/evaluation/string/embedding_distance.ipynb
@ -8,12 +8,12 @@
   "source": [
    "# Embedding Distance\n",
    "\n",
-    "To measure semantic similarity (or dissimilarity) between a prediction and a reference label string, you could use a vector vector distance metric the two embedded representations using the `embeding_distance` evaulator.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
+    "To measure semantic similarity (or dissimilarity) between a prediction and a reference label string, you could use a vector distance metric on the two embedded representations using the `embedding_distance` evaluator.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
    "\n",
    "\n",
    "**Note:** This returns a **distance** score, meaning that the lower the number, the **more** similar the prediction is to the reference, according to their embedded representation.\n",
    "\n",
-    "Check out the reference docs for the [PairwiseEmbeddingDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain.html#langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain) for more info."
+    "Check out the reference docs for the [EmbeddingDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain.html#langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain) for more info."
   ]
  },
  {
@ -26,7 +26,7 @@
   "source": [
    "from langchain.evaluation import load_evaluator\n",
    "\n",
-    "evaluator = load_evaluator(\"pairwise_embedding_distance\")"
+    "evaluator = load_evaluator(\"embedding_distance\")"
   ]
  },
  {
@ -48,7 +48,7 @@
    }
   ],
   "source": [
-    "evaluator.evaluate_string_pairs(prediction=\"I shall go\", prediction_b=\"I shan't go\")"
+    "evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
   ]
  },
  {
@ -70,7 +70,7 @@
    }
   ],
   "source": [
-    "evaluator.evaluate_string_pairs(prediction=\"I shall go\", prediction_b=\"I will go\")"
+    "evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
   ]
  },
  {
@ -118,8 +118,9 @@
   },
   "outputs": [],
   "source": [
+    "# You can load by enum or by raw python string\n",
    "evaluator = load_evaluator(\n",
-    "    \"pairwise_embedding_distance\", distance_metric=EmbeddingDistance.EUCLIDEAN\n",
+    "    \"embedding_distance\", distance_metric=EmbeddingDistance.EUCLIDEAN\n",
    ")"
   ]
  },
@ -134,7 +135,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
   "metadata": {
    "tags": []
   },
@ -143,12 +144,12 @@
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "\n",
    "embedding_model = HuggingFaceEmbeddings()\n",
-    "hf_evaluator = load_evaluator(\"pairwise_embedding_distance\", embeddings=embedding_model)"
+    "hf_evaluator = load_evaluator(\"embedding_distance\", embeddings=embedding_model)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 7,
   "metadata": {
    "tags": []
   },
@ -159,18 +160,18 @@
       "{'score': 0.5486443280477362}"
      ]
     },
-     "execution_count": 10,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "hf_evaluator.evaluate_string_pairs(prediction=\"I shall go\", prediction_b=\"I shan't go\")"
+    "hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 8,
   "metadata": {
    "tags": []
   },
@ -181,20 +182,20 @@
       "{'score': 0.21018880025138598}"
      ]
     },
-     "execution_count": 12,
+     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "hf_evaluator.evaluate_string_pairs(prediction=\"I shall go\", prediction_b=\"I will go\")"
+    "hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "<a name=\"cite_note-1\"></a><i>1. Note: When it comes to semantic similarity, this often gives better results than older string distance metrics (such as those in the `PairwiseStringDistanceEvalChain`), though it tends to be less reliable than evaluators that use the LLM directly (such as the `PairwiseStringEvalChain`) </i>"
+    "<a name=\"cite_note-1\"></a><i>1. Note: When it comes to semantic similarity, this often gives better results than older string distance metrics (such as those in the [StringDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain)), though it tends to be less reliable than evaluators that use the LLM directly (such as the [QAEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.QAEvalChain.html#langchain.evaluation.qa.eval_chain.QAEvalChain) or [LabeledCriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain)) </i>"
   ]
  }
 ],
--- a/docs/extras/modules/evaluation/string/qa.ipynb
+++ b/docs/extras/modules/evaluation/string/qa.ipynb
@ -2,12 +2,16 @@
 "cells": [
  {
   "cell_type": "markdown",
-   "id": "c701fcaf-e5dc-42a2-b8a7-027d13ff465f",
-   "metadata": {},
+   "id": "d63696a8-d035-4cf7-9605-c3210f0b551d",
+   "metadata": {
+    "tags": []
+   },
   "source": [
    "# QA Correctness\n",
    "\n",
-    "The QAEvalChain compares a question-answering model's response to a reference response.\n"
+    "When thinking about a QA system, one of the most important questions to ask is whether the final generated result is correct. The `\"qa\"` evaluator compares a question-answering model's response to a reference answer to provide this level of information. If you are able to annotate a test dataset, this evaluator will be useful.\n",
+    "\n",
+    "For more details, check out the reference docs for the [QAEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.QAEvalChain.html#langchain.evaluation.qa.eval_chain.QAEvalChain)'s class definition."
   ]
  },
  {
@ -20,11 +24,12 @@
   "outputs": [],
   "source": [
    "from langchain.chat_models import ChatOpenAI\n",
-    "from langchain.evaluation import QAEvalChain\n",
+    "from langchain.evaluation import load_evaluator\n",
    "\n",
    "llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
-    "criterion = \"conciseness\"\n",
-    "eval_chain = QAEvalChain.from_llm(llm=llm)"
+    "\n",
+    "# Note: the eval_llm is optional. A gpt-4 model will be provided by default if not specified\n",
+    "evaluator = load_evaluator(\"qa\", eval_llm=llm)"
   ]
  },
  {
@ -47,10 +52,10 @@
    }
   ],
   "source": [
-    "eval_chain.evaluate_strings(\n",
+    "evaluator.evaluate_strings(\n",
    "    input=\"What's last quarter's sales numbers?\",\n",
    "    prediction=\"Last quarter we sold 600,000 total units of product.\",\n",
-    "    reference=\"Last quarter we sold 100,000 units of product A, 200,000 units of product B, and 300,000 units of product C.\",\n",
+    "    reference=\"Last quarter we sold 100,000 units of product A, 210,000 units of product B, and 300,000 units of product C.\",\n",
    ")"
   ]
  },
@ -61,7 +66,7 @@
   "source": [
    "## SQL Correctness\n",
    "\n",
-    "You can use an LLM to check the equivalence of a SQL query against a reference SQL query. using the sql prompt."
+    "You can use an LLM to check the equivalence of a SQL query against a reference SQL query using the sql prompt."
   ]
  },
  {
@ -75,7 +80,7 @@
   "source": [
    "from langchain.evaluation.qa.eval_prompt import SQL_PROMPT\n",
    "\n",
-    "eval_chain = QAEvalChain.from_llm(llm=llm, prompt=SQL_PROMPT)"
+    "eval_chain = load_evaluator(\"qa\", eval_llm=llm, prompt=SQL_PROMPT)"
   ]
  },
  {
@ -89,7 +94,7 @@
    {
     "data": {
      "text/plain": [
-       "{'reasoning': 'The expert answer and the submission are very similar in their approach to solving the problem. Both queries are trying to calculate the sum of sales from the last quarter. They both use the SUM function to add up the sale_amount from the sales table. They also both use the same WHERE clause to filter the sales data to only include sales from the last quarter. The WHERE clause uses the DATEADD function to subtract 1 quarter from the current date (GETDATE()) and only includes sales where the sale_date is greater than or equal to this date and less than the current date.\\n\\nThe main difference between the two queries is that the expert answer uses a subquery to first select the sale_amount from the sales table with the appropriate date filter, and then sums these amounts in the outer query. The submission, on the other hand, does not use a subquery and instead sums the sale_amount directly in the main query with the same date filter.\\n\\nHowever, this difference does not affect the result of the query. Both queries will return the same result, which is the sum of sales from the last quarter.\\n\\nCORRECT',\n",
+       "{'reasoning': 'The expert answer and the submission are very similar in their structure and logic. Both queries are trying to calculate the sum of sales amounts for the last quarter. They both use the SUM function to add up the sale_amount from the sales table. They also both use the same WHERE clause to filter the sales data to only include sales from the last quarter. The WHERE clause uses the DATEADD function to subtract 1 quarter from the current date (GETDATE()) and only includes sales where the sale_date is greater than or equal to this date and less than the current date.\\n\\nThe main difference between the two queries is that the expert answer uses a subquery to first select the sale_amount from the sales table with the appropriate date filter, and then sums these amounts in the outer query. The submission, on the other hand, does not use a subquery and instead sums the sale_amount directly in the main query with the same date filter.\\n\\nHowever, this difference does not affect the result of the query. Both queries will return the same result, which is the sum of the sales amounts for the last quarter.\\n\\nCORRECT',\n",
       " 'value': 'CORRECT',\n",
       " 'score': 1}"
      ]
@ -123,7 +128,7 @@
   "source": [
    "## Using Context\n",
    "\n",
-    "Sometimes, reference labels aren't all available, but you have additional knowledge as context from a retrieval system. Often there may be additional information that isn't available to the model you want to evaluate. For this type of scenario, you can use the ContextQAEvalChain."
+    "Sometimes, reference labels aren't all available, but you have additional knowledge as context from a retrieval system. Often there may be additional information that isn't available to the model you want to evaluate. For this type of scenario, you can use the [ContextQAEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.ContextQAEvalChain.html#langchain.evaluation.qa.eval_chain.ContextQAEvalChain)."
   ]
  },
  {
@ -146,9 +151,7 @@
    }
   ],
   "source": [
-    "from langchain.evaluation import ContextQAEvalChain\n",
-    "\n",
-    "eval_chain = ContextQAEvalChain.from_llm(llm=llm)\n",
+    "eval_chain = load_evaluator(\"context_qa\", eval_llm=llm)\n",
    "\n",
    "eval_chain.evaluate_strings(\n",
    "    input=\"Who won the NFC championship game in 2023?\",\n",
@ -165,7 +168,7 @@
    "## CoT With Context\n",
    "\n",
    "The same prompt strategies such as chain of thought can be used to make the evaluation results more reliable.\n",
-    "The `CotQAEvalChain`'s default prompt instructs the model to do this."
+    "The [CotQAEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.CotQAEvalChain.html#langchain.evaluation.qa.eval_chain.CotQAEvalChain)'s default prompt instructs the model to do this."
   ]
  },
  {
@ -179,8 +182,8 @@
    {
     "data": {
      "text/plain": [
-       "{'reasoning': 'The context states that the Philadelphia Eagles won the NFC championship game in 2023. The student\\'s answer, \"Eagles,\" matches the team that won according to the context. Therefore, the student\\'s answer is correct.',\n",
-       " 'value': 'CORRECT',\n",
+       "{'reasoning': 'The student\\'s answer is \"Eagles\". The context states that the Philadelphia Eagles won the NFC championship game in 2023. Therefore, the student\\'s answer matches the information provided in the context.',\n",
+       " 'value': 'GRADE: CORRECT',\n",
       " 'score': 1}"
      ]
     },
@ -190,9 +193,7 @@
    }
   ],
   "source": [
-    "from langchain.evaluation import CotQAEvalChain\n",
-    "\n",
-    "eval_chain = CotQAEvalChain.from_llm(llm=llm)\n",
+    "eval_chain = load_evaluator(\"cot_qa\", eval_llm=llm)\n",
    "\n",
    "eval_chain.evaluate_strings(\n",
    "    input=\"Who won the NFC championship game in 2023?\",\n",
--- a/docs/extras/modules/evaluation/trajectory/custom.ipynb
+++ b/docs/extras/modules/evaluation/trajectory/custom.ipynb
@ -7,7 +7,7 @@
   "source": [
    "# Custom Trajectory Evaluator\n",
    "\n",
-    "You can make your own custom trajectory evaluators by inheriting from the `AgentTrajectoryEvaluator` class and overwriting the `_evaluate_agent_trajectory` (and `_aevaluate_agent_action`) method.\n",
+    "You can make your own custom trajectory evaluators by inheriting from the [AgentTrajectoryEvaluator](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.AgentTrajectoryEvaluator.html#langchain.evaluation.schema.AgentTrajectoryEvaluator) class and overriding the `_evaluate_agent_trajectory` (and `_aevaluate_agent_trajectory`) method.\n",
    "\n",
    "\n",
    "In this example, you will make a simple trajectory evaluator that uses an LLM to determine if any actions were unnecessary."
@ -67,7 +67,9 @@
   "id": "297dea4b-fb28-4292-b6e0-1c769cfb9cbd",
   "metadata": {},
   "source": [
-    "The example above will return a score of 1 if the language model predicts that any of the actions were unnecessary, and it returns a score of 0 if all of them were predicted to be necessary."
+    "The example above will return a score of 1 if the language model predicts that any of the actions were unnecessary, and it returns a score of 0 if all of them were predicted to be necessary.\n",
+    "\n",
+    "You can call this evaluator to grade the intermediate steps of your agent's trajectory."
   ]
  },
  {
@ -107,6 +109,12 @@
    "    ],\n",
    ")"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "77353528-723e-4075-939e-aebdb17c1e4f",
+   "metadata": {},
+   "source": []
  }
 ],
 "metadata": {
--- a/docs/extras/modules/evaluation/trajectory/trajectory_eval.ipynb
+++ b/docs/extras/modules/evaluation/trajectory/trajectory_eval.ipynb
@ -100,11 +100,13 @@
  {
   "cell_type": "markdown",
   "id": "2df34eed-45a5-4f91-88d3-9aa55f28391a",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
   "source": [
    "## Evaluate Trajectory\n",
    "\n",
-    "Pass the input, trajectory, and output to the `evaluate_agent_trajectory` function."
+    "Pass the input, trajectory, and output to the [evaluate_agent_trajectory](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.AgentTrajectoryEvaluator.html#langchain.evaluation.schema.AgentTrajectoryEvaluator.evaluate_agent_trajectory) method."
   ]
  },
  {
@ -149,7 +151,7 @@
   "source": [
    "## Configuring the Evaluation LLM\n",
    "\n",
-    "If you don't select an LLM to use for evaluation, the `load_evaluator` function will use `gpt-4` to power the evaluation chain. You can select any chat model for the agent trajectory evaluator as below."
+    "If you don't select an LLM to use for evaluation, the [load_evaluator](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.loading.load_evaluator.html#langchain.evaluation.loading.load_evaluator) function will use `gpt-4` to power the evaluation chain. You can select any chat model for the agent trajectory evaluator as below."
   ]
  },
  {
--- a/langchain/evaluation/qa/eval_chain.py
+++ b/langchain/evaluation/qa/eval_chain.py
@ -1,6 +1,7 @@
 """LLM Chain specifically for evaluating question answering."""
 from __future__ import annotations

+import re
 from typing import Any, List, Optional, Sequence

 from pydantic import Extra
@ -14,6 +15,16 @@ from langchain.schema import RUN_KEY
 from langchain.schema.language_model import BaseLanguageModel


+def _get_score(verdict: str) -> Optional[int]:
+    match = re.search(r"(?i)(?:grade:\s*)?(correct|incorrect)", verdict)
+    if match:
+        if match.group(1).upper() == "CORRECT":
+            return 1
+        elif match.group(1).upper() == "INCORRECT":
+            return 0
+    return None
+
+
 def _parse_string_eval_output(text: str) -> dict:
    """Parse the output text.

@ -30,11 +41,7 @@ def _parse_string_eval_output(text: str) -> dict:
    else:
        reasoning, verdict = splits
        reasoning = reasoning.strip()
-    score = (
-        1
-        if verdict.upper() == "CORRECT"
-        else (0 if verdict.upper() == "INCORRECT" else None)
-    )
+    score = _get_score(verdict)
    return {
        "reasoning": reasoning,
        "value": verdict,