Clean up agent trajectory interface (#6799)

- Enable reference - Enable not specifying tools at the start - Add methods with keywords
1 year ago · d7dbf4aefe
parent cc60fed3be
commit d7dbf4aefe
5 changed files with 621 additions and 110 deletions
--- a/docs/extras/guides/evaluation/generic_agent_evaluation.ipynb
+++ b/docs/extras/guides/evaluation/generic_agent_evaluation.ipynb
@ -4,9 +4,11 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Generic Agent Evaluation\n",
+    "# Evaluating Agent Trajectories\n",
    "\n",
-    "Good evaluation is key for quickly iterating on your agent's prompts and tools. Here we provide an example of how to use the TrajectoryEvalChain to evaluate your agent."
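+    "Good evaluation is key for quickly iterating on your agent's prompts and tools. One way we recommend doing this is to evaluate the agent's trajectory: the sequence of actions it takes and their outcomes.\n",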
+    "\n",
+    "Here we provide an example of how to use the TrajectoryEvalChain to evaluate the efficacy of the actions taken by your agent."
   ]
  },
  {
@ -21,7 +23,9 @@
  {
   "cell_type": "code",
   "execution_count": 2,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
   "outputs": [],
   "source": [
    "from langchain import Wikipedia\n",
@ -39,7 +43,7 @@
    "\n",
    "math_llm = OpenAI(temperature=0)\n",
    "\n",
-    "llm_math_chain = LLMMathChain(llm=math_llm, verbose=True)\n",
+    "llm_math_chain = LLMMathChain.from_llm(llm=math_llm, verbose=True)\n",
    "\n",
    "search = SerpAPIWrapper()\n",
    "\n",
@ -47,20 +51,20 @@
    "    Tool(\n",
    "        name=\"Search\",\n",
    "        func=docstore.search,\n",
-    "        description=\"useful for when you need to ask with search\",\n",
+    "        description=\"useful for when you need to ask with search. Must call before lookup.\",\n",
    "    ),\n",
    "    Tool(\n",
    "        name=\"Lookup\",\n",
    "        func=docstore.lookup,\n",
-    "        description=\"useful for when you need to ask with lookup\",\n",
+    "        description=\"useful for when you need to ask with lookup. Only call after a successfull 'Search'.\",\n",
    "    ),\n",
    "    Tool(\n",
    "        name=\"Calculator\",\n",
    "        func=llm_math_chain.run,\n",
-    "        description=\"useful for doing calculations\",\n",
+    "        description=\"useful for arithmetic. Expects strict numeric input, no words.\",\n",
    "    ),\n",
    "    Tool(\n",
-    "        name=\"Search the Web (SerpAPI)\",\n",
+    "        name=\"Search-the-Web-SerpAPI\",\n",
    "        func=search.run,\n",
    "        description=\"useful for when you need to answer questions about current events\",\n",
    "    ),\n",
@ -70,12 +74,12 @@
    "    memory_key=\"chat_history\", return_messages=True, output_key=\"output\"\n",
    ")\n",
    "\n",
-    "llm = ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo\")\n",
+    "llm = ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo-0613\")\n",
    "\n",
    "agent = initialize_agent(\n",
    "    tools,\n",
    "    llm,\n",
-    "    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,\n",
+    "    agent=AgentType.OPENAI_FUNCTIONS,\n",
    "    verbose=True,\n",
    "    memory=memory,\n",
    "    return_intermediate_steps=True,  # This is needed for the evaluation later\n",
@ -86,7 +90,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Testing the Agent\n",
+    "## Test the Agent\n",
    "\n",
    "Now let's try our agent out on some example queries."
   ]
@ -94,7 +98,9 @@
  {
   "cell_type": "code",
   "execution_count": 3,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
   "outputs": [
    {
     "name": "stdout",
@ -102,16 +108,22 @@
     "text": [
      "\n",
      "\n",
-      "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
-      "\u001b[32;1m\u001b[1;3m{\n",
-      "    \"action\": \"Search the Web (SerpAPI)\",\n",
-      "    \"action_input\": \"How many ping pong balls would it take to fill the entire Empire State Building?\"\n",
-      "}\u001b[0m\n",
-      "Observation: \u001b[31;1m\u001b[1;3m12.8 billion. The volume of the Empire State Building Googles in at around 37 million ft³. A golf ball comes in at about 2.5 in³.\u001b[0m\n",
-      "Thought:\u001b[32;1m\u001b[1;3m{\n",
-      "    \"action\": \"Final Answer\",\n",
-      "    \"action_input\": \"It would take approximately 12.8 billion ping pong balls to fill the entire Empire State Building.\"\n",
-      "}\u001b[0m\n",
+      "\u001b[1m> Entering new  chain...\u001b[0m\n",
+      "\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `Calculator` with `1040000 / (4/100)^3 / 1000000`\n",
+      "responded: {content}\n",
+      "\n",
+      "\u001b[0m\n",
+      "\n",
+      "\u001b[1m> Entering new  chain...\u001b[0m\n",
+      "1040000 / (4/100)^3 / 1000000\u001b[32;1m\u001b[1;3m```text\n",
+      "1040000 / (4/100)**3 / 1000000\n",
+      "```\n",
+      "...numexpr.evaluate(\"1040000 / (4/100)**3 / 1000000\")...\n",
+      "\u001b[0m\n",
+      "Answer: \u001b[33;1m\u001b[1;3m16249.999999999998\u001b[0m\n",
+      "\u001b[1m> Finished chain.\u001b[0m\n",
+      "\u001b[38;5;200m\u001b[1;3mAnswer: 16249.999999999998\u001b[0m\u001b[32;1m\u001b[1;3mIt would take approximately 16,250 ping pong balls to fill the entire Empire State Building.\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished chain.\u001b[0m\n"
     ]
@ -129,13 +141,15 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "This looks good! Let's try it out on another query."
+    "This looks alright.. Let's try it out on another query."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
   "outputs": [
    {
     "name": "stdout",
@ -143,43 +157,49 @@
     "text": [
      "\n",
      "\n",
-      "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
-      "\u001b[32;1m\u001b[1;3m{\n",
-      "    \"action\": \"Calculator\",\n",
-      "    \"action_input\": \"The length of the Eiffel Tower is 324 meters. The distance from coast to coast in the US is approximately 4,828 kilometers. First, we need to convert 4,828 kilometers to meters, which gives us 4,828,000 meters. To find out how many Eiffel Towers we need, we can divide 4,828,000 by 324. This gives us approximately 14,876 Eiffel Towers.\"\n",
-      "}\u001b[0m\n",
+      "\u001b[1m> Entering new  chain...\u001b[0m\n",
+      "\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `Search` with `length of the US from coast to coast`\n",
+      "\n",
+      "\n",
+      "\u001b[0m\u001b[36;1m\u001b[1;3m\n",
+      "== Watercraft ==\u001b[0m\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `Search` with `distance from coast to coast of the US`\n",
+      "\n",
+      "\n",
+      "\u001b[0m\u001b[36;1m\u001b[1;3mThe Oregon Coast is a coastal region of the U.S. state of Oregon. It is bordered by the Pacific Ocean to its west and the Oregon Coast Range to the east, and stretches approximately 362 miles (583 km) from the California state border in the south to the Columbia River in the north. The region is not a specific geological, environmental, or political entity, and includes the Columbia River Estuary.\n",
+      "The Oregon Beach Bill of 1967 allows free beach access to everyone.  In return for a pedestrian easement and relief from construction, the bill eliminates property taxes on private beach land and allows its owners to retain certain beach land rights.Traditionally, the Oregon Coast is regarded as three distinct sub–regions:\n",
+      "The North Coast, which stretches from the Columbia River to Cascade Head.\n",
+      "The Central Coast, which stretches from Cascade Head to Reedsport.\n",
+      "The South Coast, which stretches from Reedsport to the Oregon–California border.The largest city is Coos Bay, population 16,700 in Coos County on the South Coast. U.S. Route 101 is the primary highway from Brookings to Astoria and is known for its scenic overlooks of the Pacific Ocean. Over 80 state parks and recreation areas dot the Oregon Coast. However, only a few highways cross the Coast Range to the interior: US 30, US 26, OR 6, US 20, OR 18, OR 34, OR 126, OR 38, and OR 42.  OR 18 and US 20 are considered among the dangerous roads in the state.The Oregon Coast includes Clatsop County, Tillamook County, Lincoln County, western Lane County, western Douglas County, Coos County, and Curry County.\u001b[0m\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `Calculator` with `362 miles * 5280 feet`\n",
      "\n",
-      "\u001b[1m> Entering new LLMMathChain chain...\u001b[0m\n",
-      "The length of the Eiffel Tower is 324 meters. The distance from coast to coast in the US is approximately 4,828 kilometers. First, we need to convert 4,828 kilometers to meters, which gives us 4,828,000 meters. To find out how many Eiffel Towers we need, we can divide 4,828,000 by 324. This gives us approximately 14,876 Eiffel Towers.\u001b[32;1m\u001b[1;3m\n",
-      "```text\n",
-      "4828000 / 324\n",
+      "\n",
+      "\u001b[0m\n",
+      "\n",
+      "\u001b[1m> Entering new  chain...\u001b[0m\n",
+      "362 miles * 5280 feet\u001b[32;1m\u001b[1;3m```text\n",
+      "362 * 5280\n",
      "```\n",
-      "...numexpr.evaluate(\"4828000 / 324\")...\n",
+      "...numexpr.evaluate(\"362 * 5280\")...\n",
      "\u001b[0m\n",
-      "Answer: \u001b[33;1m\u001b[1;3m14901.234567901234\u001b[0m\n",
+      "Answer: \u001b[33;1m\u001b[1;3m1911360\u001b[0m\n",
      "\u001b[1m> Finished chain.\u001b[0m\n",
+      "\u001b[38;5;200m\u001b[1;3mAnswer: 1911360\u001b[0m\u001b[32;1m\u001b[1;3m\n",
+      "Invoking: `Calculator` with `1911360 feet / 1063 feet`\n",
      "\n",
-      "Observation: \u001b[38;5;200m\u001b[1;3mAnswer: 14901.234567901234\u001b[0m\n",
-      "Thought:\u001b[32;1m\u001b[1;3m{\n",
-      "    \"action\": \"Calculator\",\n",
-      "    \"action_input\": \"The length of the Eiffel Tower is 324 meters. The distance from coast to coast in the US is approximately 4,828 kilometers. First, we need to convert 4,828 kilometers to meters, which gives us 4,828,000 meters. To find out how many Eiffel Towers we need, we can divide 4,828,000 by 324. This gives us approximately 14,901 Eiffel Towers.\"\n",
-      "}\u001b[0m\n",
      "\n",
-      "\u001b[1m> Entering new LLMMathChain chain...\u001b[0m\n",
-      "The length of the Eiffel Tower is 324 meters. The distance from coast to coast in the US is approximately 4,828 kilometers. First, we need to convert 4,828 kilometers to meters, which gives us 4,828,000 meters. To find out how many Eiffel Towers we need, we can divide 4,828,000 by 324. This gives us approximately 14,901 Eiffel Towers.\u001b[32;1m\u001b[1;3m\n",
-      "```text\n",
-      "4828000 / 324\n",
+      "\u001b[0m\n",
+      "\n",
+      "\u001b[1m> Entering new  chain...\u001b[0m\n",
+      "1911360 feet / 1063 feet\u001b[32;1m\u001b[1;3m```text\n",
+      "1911360 / 1063\n",
      "```\n",
-      "...numexpr.evaluate(\"4828000 / 324\")...\n",
+      "...numexpr.evaluate(\"1911360 / 1063\")...\n",
      "\u001b[0m\n",
-      "Answer: \u001b[33;1m\u001b[1;3m14901.234567901234\u001b[0m\n",
+      "Answer: \u001b[33;1m\u001b[1;3m1798.0809031044214\u001b[0m\n",
      "\u001b[1m> Finished chain.\u001b[0m\n",
-      "\n",
-      "Observation: \u001b[38;5;200m\u001b[1;3mAnswer: 14901.234567901234\u001b[0m\n",
-      "Thought:\u001b[32;1m\u001b[1;3m{\n",
-      "    \"action\": \"Final Answer\",\n",
-      "    \"action_input\": \"If you laid the Eiffel Tower end to end, you would need approximately 14,901 Eiffel Towers to cover the US from coast to coast.\"\n",
-      "}\u001b[0m\n",
+      "\u001b[38;5;200m\u001b[1;3mAnswer: 1798.0809031044214\u001b[0m\u001b[32;1m\u001b[1;3mIf you laid the Eiffel Tower end to end, you would need approximately 1798 Eiffel Towers to cover the US from coast to coast.\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished chain.\u001b[0m\n"
     ]
@ -205,16 +225,17 @@
  {
   "cell_type": "code",
   "execution_count": 5,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
   "outputs": [],
   "source": [
    "from langchain.evaluation.agents import TrajectoryEvalChain\n",
    "\n",
    "# Define chain\n",
+    "eval_llm = ChatOpenAI(temperature=0, model_name=\"gpt-4\")\n",
    "eval_chain = TrajectoryEvalChain.from_llm(\n",
-    "    llm=ChatOpenAI(\n",
-    "        temperature=0, model_name=\"gpt-4\"\n",
-    "    ),  # Note: This must be a ChatOpenAI model\n",
+    "    llm=eval_llm,  # Note: This must be a chat model\n",
    "    agent_tools=agent.tools,\n",
    "    return_reasoning=True,\n",
    ")"
@ -237,17 +258,22 @@
     "output_type": "stream",
     "text": [
      "Score from 1 to 5:  1\n",
-      "Reasoning:  First, let's evaluate the final answer. The final answer is incorrect because it uses the volume of golf balls instead of ping pong balls. The answer is not helpful.\n",
+      "Reasoning:  i. Is the final answer helpful?\n",
+      "The final answer is not helpful because it is incorrect. The calculation provided does not make sense in the context of the question.\n",
      "\n",
-      "Second, does the model use a logical sequence of tools to answer the question? The model only used one tool, which was the Search the Web (SerpAPI). It did not use the Calculator tool to calculate the correct volume of ping pong balls.\n",
+      "ii. Does the AI language use a logical sequence of tools to answer the question?\n",
+      "The AI language model does not use a logical sequence of tools. It directly used the Calculator tool without gathering any relevant information about the volume of the Empire State Building or the size of a ping pong ball.\n",
      "\n",
-      "Third, does the AI language model use the tools in a helpful way? The model used the Search the Web (SerpAPI) tool, but the output was not helpful because it provided information about golf balls instead of ping pong balls.\n",
+      "iii. Does the AI language model use the tools in a helpful way?\n",
+      "The AI language model does not use the tools in a helpful way. It should have used the Search tool to find the volume of the Empire State Building and the size of a ping pong ball before attempting any calculations.\n",
      "\n",
-      "Fourth, does the AI language model use too many steps to answer the question? The model used only one step, which is not too many. However, it should have used more steps to provide a correct answer.\n",
+      "iv. Does the AI language model use too many steps to answer the question?\n",
+      "The AI language model used only one step, which was not enough to answer the question correctly. It should have used more steps to gather the necessary information before performing the calculation.\n",
      "\n",
-      "Fifth, are the appropriate tools used to answer the question? The model should have used the Search tool to find the volume of the Empire State Building and the volume of a ping pong ball. Then, it should have used the Calculator tool to calculate the number of ping pong balls needed to fill the building.\n",
+      "v. Are the appropriate tools used to answer the question?\n",
+      "The appropriate tools were not used to answer the question. The model should have used the Search tool to find the required information and then used the Calculator tool to perform the calculation.\n",
      "\n",
-      "Judgment: Given the incorrect final answer and the inappropriate use of tools, we give the model a score of 1.\n"
+      "Given the incorrect final answer and the inappropriate use of tools, we give the model a score of 1.\n"
     ]
    }
   ],
@ -258,12 +284,10 @@
    "    test_outputs_one[\"output\"],\n",
    ")\n",
    "\n",
-    "evaluation = eval_chain(\n",
-    "    inputs={\n",
-    "        \"question\": question,\n",
-    "        \"answer\": answer,\n",
-    "        \"agent_trajectory\": eval_chain.get_agent_trajectory(steps),\n",
-    "    },\n",
+    "evaluation = eval_chain.evaluate_agent_trajectory(\n",
+    "    input=test_outputs_one[\"input\"],\n",
+    "    output=test_outputs_one[\"output\"],\n",
+    "    agent_trajectory=test_outputs_one[\"intermediate_steps\"],\n",
    ")\n",
    "\n",
    "print(\"Score from 1 to 5: \", evaluation[\"score\"])\n",
@ -274,51 +298,97 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "That seems about right. Let's try the second query."
+    "**That seems about right. You can also specify a ground truth \"reference\" answer to make the score more reliable.**"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
+   "execution_count": 13,
+   "metadata": {
+    "tags": []
+   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Score from 1 to 5:  3\n",
+      "Score from 1 to 5:  1\n",
      "Reasoning:  i. Is the final answer helpful?\n",
-      "Yes, the final answer is helpful as it provides an approximate number of Eiffel Towers needed to cover the US from coast to coast.\n",
+      "The final answer is not helpful, as it is incorrect. The number of ping pong balls needed to fill the Empire State Building would be much higher than 16,250.\n",
      "\n",
      "ii. Does the AI language use a logical sequence of tools to answer the question?\n",
-      "No, the AI language model does not use a logical sequence of tools. It directly uses the Calculator tool without first using the Search or Lookup tools to find the necessary information (length of the Eiffel Tower and distance from coast to coast in the US).\n",
+      "The AI language model does not use a logical sequence of tools. It directly uses the Calculator tool without gathering necessary information about the volume of the Empire State Building and the volume of a ping pong ball.\n",
      "\n",
      "iii. Does the AI language model use the tools in a helpful way?\n",
-      "The AI language model uses the Calculator tool in a helpful way to perform the calculation, but it should have used the Search or Lookup tools first to find the required information.\n",
+      "The AI language model does not use the tools in a helpful way. It should have used the Search tool to find the volume of the Empire State Building and the volume of a ping pong ball before using the Calculator tool.\n",
      "\n",
      "iv. Does the AI language model use too many steps to answer the question?\n",
-      "No, the AI language model does not use too many steps. However, it repeats the same step twice, which is unnecessary.\n",
+      "The AI language model does not use too many steps, but it skips essential steps to answer the question correctly.\n",
      "\n",
      "v. Are the appropriate tools used to answer the question?\n",
-      "Not entirely. The AI language model should have used the Search or Lookup tools to find the required information before using the Calculator tool.\n",
+      "The appropriate tools are not used to answer the question. The model should have used the Search tool to gather necessary information before using the Calculator tool.\n",
      "\n",
-      "Given the above evaluation, the AI language model's performance can be scored as follows:\n"
+      "Given the incorrect final answer and the inappropriate use of tools, we give the model a score of 1.\n"
     ]
    }
   ],
   "source": [
-    "question, steps, answer = (\n",
-    "    test_outputs_two[\"input\"],\n",
-    "    test_outputs_two[\"intermediate_steps\"],\n",
-    "    test_outputs_two[\"output\"],\n",
+    "evaluation = eval_chain.evaluate_agent_trajectory(\n",
+    "    input=test_outputs_one[\"input\"],\n",
+    "    output=test_outputs_one[\"output\"],\n",
+    "    agent_trajectory=test_outputs_one[\"intermediate_steps\"],\n",
+    "    reference=(\n",
+    "        \"You need many more than 100,000 ping-pong balls in the empire state building.\"\n",
+    "    )\n",
    ")\n",
+    "    \n",
    "\n",
-    "evaluation = eval_chain(\n",
-    "    inputs={\n",
-    "        \"question\": question,\n",
-    "        \"answer\": answer,\n",
-    "        \"agent_trajectory\": eval_chain.get_agent_trajectory(steps),\n",
-    "    },\n",
+    "print(\"Score from 1 to 5: \", evaluation[\"score\"])\n",
+    "print(\"Reasoning: \", evaluation[\"reasoning\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Let's try the second query. This time, use the async API. If we wanted to\n",
+    "evaluate multiple runs at once, this would let us add some concurrency.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Score from 1 to 5:  2\n",
+      "Reasoning:  i. Is the final answer helpful?\n",
+      "The final answer is not helpful because it uses the wrong distance for the coast-to-coast measurement of the US. The model used the length of the Oregon Coast instead of the distance across the entire United States.\n",
+      "\n",
+      "ii. Does the AI language use a logical sequence of tools to answer the question?\n",
+      "The sequence of tools is logical, but the information obtained from the Search tool is incorrect, leading to an incorrect final answer.\n",
+      "\n",
+      "iii. Does the AI language model use the tools in a helpful way?\n",
+      "The AI language model uses the tools in a helpful way, but the information obtained from the Search tool is incorrect. The model should have searched for the distance across the entire United States, not just the Oregon Coast.\n",
+      "\n",
+      "iv. Does the AI language model use too many steps to answer the question?\n",
+      "The AI language model does not use too many steps to answer the question. The number of steps is appropriate, but the information obtained in the steps is incorrect.\n",
+      "\n",
+      "v. Are the appropriate tools used to answer the question?\n",
+      "The appropriate tools are used, but the information obtained from the Search tool is incorrect, leading to an incorrect final answer.\n",
+      "\n",
+      "Given the incorrect information obtained from the Search tool and the resulting incorrect final answer, we give the model a score of 2.\n"
+     ]
+    }
+   ],
+   "source": [
+    "evaluation = await eval_chain.aevaluate_agent_trajectory(\n",
+    "    input=test_outputs_two[\"input\"],\n",
+    "    output=test_outputs_two[\"output\"],\n",
+    "    agent_trajectory=test_outputs_two[\"intermediate_steps\"],\n",
    ")\n",
    "\n",
    "print(\"Score from 1 to 5: \", evaluation[\"score\"])\n",
@ -329,7 +399,11 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "That also sounds about right. In conclusion, the TrajectoryEvalChain allows us to use GPT-4 to score both our agent's outputs and tool use in addition to giving us the reasoning behind the evaluation."
+    "## Conclusion\n",
+    "\n",
+    "In this example, you evaluated an agent based on its entire \"trajectory\" using the `TrajectoryEvalChain`. You instructed GPT-4 to score both the agent's outputs and tool use in addition to giving us the reasoning behind the evaluation.\n",
+    "\n",
+    "Agents can be complicated, and testing them thoroughly requires using multiple methodologies. Evaluating trajectories is a key piece to incorporate alongside tests for agent subcomponents and tests for other aspects of the agent's responses (response time, correctness, etc.)."
   ]
  }
 ],
@ -349,7 +423,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.11.3"
  },
  "vscode": {
   "interpreter": {
@ -358,5 +432,5 @@
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
--- a/langchain/evaluation/agents/trajectory_eval_chain.py
+++ b/langchain/evaluation/agents/trajectory_eval_chain.py
@ -1,11 +1,26 @@
-"""A chain for evaluating ReAct style agents."""
+"""A chain for evaluating ReAct style agents.
+
+This chain is used to evaluate ReAct style agents by reasoning about
+the sequence of actions taken and their outcomes. It uses a language model
+chain (LLMChain) to generate the reasoning and scores.
+"""
+
 from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union

-from langchain.callbacks.manager import CallbackManagerForChainRun
+from pydantic import Field
+
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForChainRun,
+    CallbackManagerForChainRun,
+    Callbacks,
+)
 from langchain.chains.base import Chain
 from langchain.chains.llm import LLMChain
-from langchain.chat_models import ChatOpenAI
-from langchain.evaluation.agents.trajectory_eval_prompt import EVAL_CHAT_PROMPT
+from langchain.chat_models.base import BaseChatModel
+from langchain.evaluation.agents.trajectory_eval_prompt import (
+    EVAL_CHAT_PROMPT,
+    TOOL_FREE_EVAL_CHAT_PROMPT,
+)
 from langchain.schema import AgentAction, BaseOutputParser, OutputParserException
 from langchain.tools.base import BaseTool

@ -21,6 +36,18 @@ class TrajectoryOutputParser(BaseOutputParser):
        return "agent_trajectory"

    def parse(self, text: str) -> TrajectoryEval:
+        """Parse the output text and extract the score and reasoning.
+
+        Args:
+            text (str): The output text to parse.
+
+        Returns:
+            TrajectoryEval: A named tuple containing the score and reasoning.
+
+        Raises:
+            OutputParserException: If the score is not found in the output text or
+                if the score is not a digit in the range 1-5.
+        """
        if "Score:" not in text:
            raise OutputParserException(
                f"Could not find score in model eval output: {text}"
@ -43,13 +70,68 @@ class TrajectoryOutputParser(BaseOutputParser):


 class TrajectoryEvalChain(Chain):
-    agent_tools: List[BaseTool]
+    """A chain for evaluating ReAct style agents.
+
+    This chain is used to evaluate ReAct style agents by reasoning about
+    the sequence of actions taken and their outcomes.
+
+    Example:
+        .. code-block:: python
+            from langchain.agents import AgentType, initialize_agent
+            from langchain.chat_models import ChatOpenAI
+            from langchain.evaluation import TrajectoryEvalChain
+            from langchain.tools import tool
+
+            @tool
+            def geography_answers(country: str, question: str) -> str:
+                \"\"\"Very helpful answers to geography questions.\"\"\"
+                return f"{country}? IDK - We may never know {question}."
+
+            llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0)
+            agent = initialize_agent(
+                tools=[geography_answers],
+                llm=llm,
+                agent=AgentType.OPENAI_FUNCTIONS,
+                return_intermediate_steps=True,
+            )
+
+            question = "How many dwell in the largest minor region in Argentina?"
+            response = agent(question)
+
+            eval_chain = TrajectoryEvalChain.from_llm(
+                llm=llm, agent_tools=[geography_answers], return_reasoning=True
+            )
+
+            result = eval_chain.evaluate_agent_trajectory(
+                input=question,
+                agent_trajectory=response["intermediate_steps"],
+                output=response["output"],
+                reference="Paris",
+            )
+            print(result["score"])
+            # 0
+    """  # noqa: E501
+
+    agent_tools: Optional[List[BaseTool]] = None
+    """A list of tools available to the agent."""
    eval_chain: LLMChain
-    output_parser: TrajectoryOutputParser
+    """The language model chain used for evaluation."""
+    output_parser: TrajectoryOutputParser = Field(
+        default_factory=TrajectoryOutputParser
+    )
+    """The output parser used to parse the output."""
    return_reasoning: bool = False
+    """Whether to return the reasoning along with the score."""

    @property
    def _tools_description(self) -> str:
+        """Get the description of the agent tools.
+
+        Returns:
+            str: The description of the agent tools.
+        """
+        if self.agent_tools is None:
+            return ""
        return "\n\n".join(
            [
                f"""Tool {i}: {tool.name}
@ -60,6 +142,14 @@ Description: {tool.description}"""

    @staticmethod
    def get_agent_trajectory(steps: Union[str, List[Tuple[AgentAction, str]]]) -> str:
+        """Get the agent trajectory as a formatted string.
+
+        Args:
+            steps (Union[str, List[Tuple[AgentAction, str]]]): The agent trajectory.
+
+        Returns:
+            str: The formatted agent trajectory.
+        """
        if isinstance(steps, str):
            return steps

@ -73,15 +163,53 @@ Tool output: {output}"""
            ]
        )

+    @staticmethod
+    def _format_reference(reference: Optional[str]) -> str:
+        """Format the reference text.
+
+        Args:
+            reference (str): The reference text.
+
+        Returns:
+            str: The formatted reference text.
+        """
+        if not reference:
+            return ""
+        return f"""
+
+The following is the expected answer. Use this to measure correctness:
+[GROUND_TRUTH]
+{reference}
+[END_GROUND_TRUTH]
+"""
+
    @classmethod
    def from_llm(
        cls,
-        llm: ChatOpenAI,
-        agent_tools: Sequence[BaseTool],
+        llm: BaseChatModel,
+        agent_tools: Optional[Sequence[BaseTool]] = None,
        output_parser: Optional[TrajectoryOutputParser] = None,
        return_reasoning: bool = False,
    ) -> "TrajectoryEvalChain":
-        eval_chain = LLMChain(llm=llm, prompt=EVAL_CHAT_PROMPT)
+        """Create a TrajectoryEvalChain object from a language model chain.
+
+        Args:
+            llm (BaseChatModel): The language model chain.
+            agent_tools (Optional[Sequence[BaseTool]]): A list of tools
+                available to the agent.
+            output_parser (Optional[TrajectoryOutputParser]): The output parser
+                used to parse the chain output into a score.
+            return_reasoning (bool): Whether to return the
+                reasoning along with the score.
+
+        Returns:
+            TrajectoryEvalChain: The TrajectoryEvalChain object.
+        """
+        if agent_tools:
+            prompt = EVAL_CHAT_PROMPT
+        else:
+            prompt = TOOL_FREE_EVAL_CHAT_PROMPT
+        eval_chain = LLMChain(llm=llm, prompt=prompt)
        return cls(
            agent_tools=agent_tools,
            return_reasoning=return_reasoning,
@ -91,25 +219,169 @@ Tool output: {output}"""

    @property
    def input_keys(self) -> List[str]:
-        return ["question", "agent_trajectory", "answer"]
+        """Get the input keys for the chain.
+
+        Returns:
+            List[str]: The input keys.
+        """
+        return ["question", "agent_trajectory", "answer", "reference"]

    @property
    def output_keys(self) -> List[str]:
+        """Get the output keys for the chain.
+
+        Returns:
+            List[str]: The output keys.
+        """
        if self.return_reasoning:
            return ["score", "reasoning"]
        return ["score"]

+    def __call__(
+        self,
+        inputs: Union[Dict[str, Any], Any],
+        return_only_outputs: bool = False,
+        callbacks: Callbacks = None,
+        *,
+        tags: Optional[List[str]] = None,
+        include_run_info: bool = False,
+    ) -> Dict[str, Any]:
+        """Run the logic of this chain and add to output if desired.
+
+        Args:
+            inputs: Dictionary of inputs, or single input if chain expects
+                only one param.
+            return_only_outputs: boolean for whether to return only outputs in the
+                response. If True, only new keys generated by this chain will be
+                returned. If False, both input keys and new keys generated by this
+                chain will be returned. Defaults to False.
+            callbacks: Callbacks to use for this chain run. If not provided, will
+                use the callbacks provided to the chain.
+            include_run_info: Whether to include run info in the response. Defaults
+                to False.
+        """
+        if "reference" not in inputs:
+            inputs["reference"] = ""
+        return super().__call__(
+            inputs=inputs,
+            return_only_outputs=return_only_outputs,
+            callbacks=callbacks,
+            tags=tags,
+            include_run_info=include_run_info,
+        )
+
    def _call(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
-        raw_output = self.eval_chain.run(
-            {"tool_descriptions": self._tools_description, **inputs}
-        )
+        """Run the chain and generate the output.
+
+        Args:
+            inputs (Dict[str, str]): The input values for the chain.
+            run_manager (Optional[CallbackManagerForChainRun]): The callback
+                manager for the chain run.
+
+        Returns:
+            Dict[str, Any]: The output values of the chain.
+        """
+        chain_input = {**inputs}
+        if self.agent_tools:
+            chain_input["tool_descriptions"] = self._tools_description
+        raw_output = self.eval_chain.run(chain_input)
        parsed_output = self.output_parser.parse(raw_output)

        if self.return_reasoning:
            return {"score": parsed_output.score, "reasoning": parsed_output.reasoning}

        return {"score": parsed_output.score}
+
+    async def _acall(
+        self,
+        inputs: Dict[str, str],
+        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        """Asynchronously run the chain and generate the output.
+
+        Args:
+            inputs (Dict[str, str]): The input values for the chain.
+            run_manager (Optional[AsyncCallbackManagerForChainRun]): The
+                async callback manager for the chain run.
+
+        Returns:
+            Dict[str, Any]: The output values of the chain.
+        """
+        chain_input = {**inputs}
+        if self.agent_tools:
+            chain_input["tool_descriptions"] = self._tools_description
+        raw_output = await self.eval_chain.arun(chain_input)
+        parsed_output = self.output_parser.parse(raw_output)
+
+        if self.return_reasoning:
+            return {"score": parsed_output.score, "reasoning": parsed_output.reasoning}
+
+        return {"score": parsed_output.score}
+
+    def evaluate_agent_trajectory(
+        self,
+        *,
+        input: str,
+        agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
+        output: str,
+        reference: Optional[str] = None,
+        callbacks: Callbacks = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate a trajectory.
+
+        Args:
+            input (str): The input question.
+            agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
+                The intermediate steps forming the agent trajectory.
+            output (str): The agent's final answer to be evaluated.
+            reference (Optional[str]): The reference answer.
+
+        Returns:
+            dict: The evaluation result.
+        """
+        inputs = {
+            "question": input,
+            "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
+            "answer": output,
+            "reference": self._format_reference(reference),
+        }
+        return self(inputs=inputs, callbacks=callbacks, **kwargs)
+
+    async def aevaluate_agent_trajectory(
+        self,
+        *,
+        input: str,
+        agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
+        output: str,
+        reference: Optional[str] = None,
+        callbacks: Callbacks = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate a trajectory.
+
+        Args:
+            input (str): The input question.
+            agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
+                The intermediate steps forming the agent trajectory.
+            output (str): The agent's final answer to be evaluated.
+            reference (Optional[str]): The reference answer.
+
+        Returns:
+            dict: The evaluation result.
+        """
+        inputs = {
+            "question": input,
+            "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
+            "answer": output,
+            "reference": self._format_reference(reference),
+        }
+        return await self.acall(
+            inputs=inputs,
+            callbacks=callbacks,
+            **kwargs,
+        )
--- a/langchain/evaluation/agents/trajectory_eval_prompt.py
+++ b/langchain/evaluation/agents/trajectory_eval_prompt.py
@ -13,16 +13,24 @@ from langchain.prompts.chat import (
 EVAL_TEMPLATE = """An AI language model has been given access to the following set of tools to help answer a user's question.

 The tools given to the AI model are:
-
+[TOOL_DESCRIPTIONS]
 {tool_descriptions}
+[END_TOOL_DESCRIPTIONS]

-The question the human asked the AI model was: {question}
+The question the human asked the AI model was:
+[QUESTION]
+{question}
+[END_QUESTION]{reference}

 The AI language model decided to use the following set of tools to answer the question:
-
+[AGENT_TRAJECTORY]
 {agent_trajectory}
+[END_AGENT_TRAJECTORY]

-The AI language model's final answer to the question was: {answer}
+The AI language model's final answer to the question was:
+[RESPONSE]
+{answer}
+[END_RESPONSE]

 Let's to do a detailed evaluation of the AI language model's answer step by step.

@ -37,7 +45,7 @@ v. Are the appropriate tools used to answer the question?"""
 EXAMPLE_INPUT = """An AI language model has been given acces to the following set of tools to help answer a user's question.

 The tools given to the AI model are:
-
+[TOOL_DESCRIPTIONS]
 Tool 1:
 Name: Search
 Description: useful for when you need to ask with search
@ -53,17 +61,21 @@ Description: useful for doing calculations
 Tool 4:
 Name: Search the Web (SerpAPI)
 Description: useful for when you need to answer questions about current events
+[END_TOOL_DESCRIPTIONS]

 The question the human asked the AI model was: If laid the Statue of Liberty end to end, how many times would it stretch across the United States?

 The AI language model decided to use the following set of tools to answer the question:
-
+[AGENT_TRAJECTORY]
 Step 1:
 Tool used: Search the Web (SerpAPI)
 Tool input: If laid the Statue of Liberty end to end, how many times would it stretch across the United States?
 Tool output: The Statue of Liberty was given to the United States by France, as a symbol of the two countries' friendship. It was erected atop an American-designed ...
+[END_AGENT_TRAJECTORY]

+[RESPONSE]
 The AI language model's final answer to the question was: There are different ways to measure the length of the United States, but if we use the distance between the Statue of Liberty and the westernmost point of the contiguous United States (Cape Alava, Washington), which is approximately 2,857 miles (4,596 km), and assume that the Statue of Liberty is 305 feet (93 meters) tall, then the statue would stretch across the United States approximately 17.5 times if laid end to end.
+[END_RESPONSE]

 Let's to do a detailed evaluation of the AI language model's answer step by step.

@ -96,3 +108,43 @@ EVAL_CHAT_PROMPT = ChatPromptTemplate.from_messages(
        HumanMessagePromptTemplate.from_template(EVAL_TEMPLATE),
    ]
 )
+
+
+TOOL_FREE_EVAL_TEMPLATE = """An AI language model has been given access to a set of tools to help answer a user's question.
+
+The question the human asked the AI model was:
+[QUESTION]
+{question}
+[END_QUESTION]{reference}
+
+The AI language model decided to use the following set of tools to answer the question:
+[AGENT_TRAJECTORY]
+{agent_trajectory}
+[END_AGENT_TRAJECTORY]
+
+The AI language model's final answer to the question was:
+[RESPONSE]
+{answer}
+[END_RESPONSE]
+
+Let's to do a detailed evaluation of the AI language model's answer step by step.
+
+We consider the following criteria before giving a score from 1 to 5:
+
+i. Is the final answer helpful?
+ii. Does the AI language use a logical sequence of tools to answer the question?
+iii. Does the AI language model use the tools in a helpful way?
+iv. Does the AI language model use too many steps to answer the question?
+v. Are the appropriate tools used to answer the question?"""
+
+
+TOOL_FREE_EVAL_CHAT_PROMPT = ChatPromptTemplate.from_messages(
+    messages=[
+        SystemMessage(
+            content="You are a helpful assistant that evaluates language models."
+        ),
+        HumanMessage(content=EXAMPLE_INPUT),
+        AIMessage(content=EXAMPLE_OUTPUT),
+        HumanMessagePromptTemplate.from_template(TOOL_FREE_EVAL_TEMPLATE),
+    ]
+)
--- a/tests/unit_tests/evaluation/agents/__init__.py
+++ b/tests/unit_tests/evaluation/agents/__init__.py
--- a/tests/unit_tests/evaluation/agents/test_eval_chain.py
+++ b/tests/unit_tests/evaluation/agents/test_eval_chain.py
@ -0,0 +1,113 @@
+"""Test agent trajectory evaluation chain."""
+
+from typing import List, Tuple
+
+import pytest
+
+from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
+from langchain.schema import AgentAction
+from langchain.tools.base import tool
+from tests.unit_tests.llms.fake_llm import FakeLLM
+
+
+@pytest.fixture
+def intermediate_steps() -> List[Tuple[AgentAction, str]]:
+    return [
+        (
+            AgentAction(
+                tool="Foo",
+                tool_input="Bar",
+                log="Star date 2021-06-13: Foo received input: Bar",
+            ),
+            "Baz",
+        ),
+    ]
+
+
+@tool
+def foo(bar: str) -> str:
+    """Foo."""
+    return bar
+
+
+def test_trajectory_eval_chain(
+    intermediate_steps: List[Tuple[AgentAction, str]]
+) -> None:
+    llm = FakeLLM(
+        queries={
+            "a": "Trajectory good\nScore: 5",
+            "b": "Trajectory not good\nScore: 1",
+        },
+        sequential_responses=True,
+    )
+    chain = TrajectoryEvalChain.from_llm(llm=llm, agent_tools=[foo])  # type: ignore
+    # Test when ref is not provided
+    res = chain.evaluate_agent_trajectory(
+        input="What is your favorite food?",
+        agent_trajectory=intermediate_steps,
+        output="I like pie.",
+    )
+    assert res["score"] == 5
+    # Test when ref is provided
+    res = chain.evaluate_agent_trajectory(
+        input="What is your favorite food?",
+        agent_trajectory=intermediate_steps,
+        output="I like pie.",
+        reference="Paris",
+    )
+    assert res["score"] == 1
+
+
+def test_trajectory_eval_chain_no_tools(
+    intermediate_steps: List[Tuple[AgentAction, str]]
+) -> None:
+    llm = FakeLLM(
+        queries={
+            "a": "Trajectory good\nScore: 5",
+            "b": "Trajectory not good\nScore: 1",
+        },
+        sequential_responses=True,
+    )
+    chain = TrajectoryEvalChain.from_llm(llm=llm)  # type: ignore
+    res = chain.evaluate_agent_trajectory(
+        input="What is your favorite food?",
+        agent_trajectory=intermediate_steps,
+        output="I like pie.",
+    )
+    assert res["score"] == 5
+    res = chain.evaluate_agent_trajectory(
+        input="What is your favorite food?",
+        agent_trajectory=intermediate_steps,
+        output="I like pie.",
+        reference="Paris",
+    )
+    assert res["score"] == 1
+
+
+def test_old_api_works(intermediate_steps: List[Tuple[AgentAction, str]]) -> None:
+    llm = FakeLLM(
+        queries={
+            "a": "Trajectory good\nScore: 5",
+            "b": "Trajectory not good\nScore: 1",
+        },
+        sequential_responses=True,
+    )
+    chain = TrajectoryEvalChain.from_llm(llm=llm)  # type: ignore
+    res = chain(
+        {
+            "question": "What is your favorite food?",
+            "agent_trajectory": intermediate_steps,
+            "answer": "I like pie.",
+        }
+    )
+    assert res["score"] == 5
+
+    res = chain(
+        {
+            "question": "What is your favorite food?",
+            "agent_trajectory": intermediate_steps,
+            "answer": "I like pie.",
+            "reference": "Paris",
+        }
+    )
+    assert res["score"] == 1