From 940b9ae30ace7d57fff1e98275bc5e4e254b0675 Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Wed, 4 Oct 2023 15:59:28 -0700 Subject: [PATCH] Normalize Option in Scoring Chain (#11412) --- .../string/scoring_eval_chain.ipynb | 230 ++- .../extras/guides/langsmith/walkthrough.ipynb | 1571 ++++++++--------- .../langchain/evaluation/__init__.py | 6 + .../evaluation/scoring/eval_chain.py | 54 +- .../langchain/evaluation/scoring/prompt.py | 5 +- .../langchain/smith/evaluation/config.py | 38 +- 6 files changed, 1087 insertions(+), 817 deletions(-) diff --git a/docs/extras/guides/evaluation/string/scoring_eval_chain.ipynb b/docs/extras/guides/evaluation/string/scoring_eval_chain.ipynb index 39b02d45a5..fabed645a3 100644 --- a/docs/extras/guides/evaluation/string/scoring_eval_chain.ipynb +++ b/docs/extras/guides/evaluation/string/scoring_eval_chain.ipynb @@ -5,20 +5,212 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Overall quality evaluation\n", + "# Scoring Evaluator\n", "\n", - "In scenarios where you wish to score a model's output from 1-10 based on a criteria set and/or reference answer, the `Score` evaluator can be helpful. This is most useful for comparing the performance of different models on a given task.\n", + "The Scoring Evaluator instructs a language model to assess your model's predictions on a specified scale (default is 1-10) based on your custom criteria or rubric. This feature provides a nuanced evaluation instead of a simplistic binary score, aiding in evaluating models against tailored rubrics and comparing model performance on specific tasks.\n", "\n", - "Refer to the documentation of the [ScoreStringEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain.html#langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain) class for full details.\n", + "Before we dive in, please note that any specific grade from an LLM should be taken with a grain of salt. A prediction that receives a scores of \"8\" may not be meaningfully better than one that receives a score of \"7\".\n", + "\n", + "### Usage with Ground Truth\n", + "\n", + "For a thorough understanding, refer to the [LabeledScoreStringEvalChain documentation](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.scoring.eval_chain.LabeledScoreStringEvalChain.html#langchain.evaluation.scoring.eval_chain.LabeledScoreStringEvalChain).\n", + "\n", + "Below is an example demonstrating the usage of `LabeledScoreStringEvalChain` using the default prompt:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.evaluation import load_evaluator\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "evaluator = load_evaluator(\"labeled_score_string\", llm=ChatOpenAI(model=\"gpt-4\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': \"The assistant's response is helpful, accurate, and directly answers the user's question. It correctly refers to the ground truth provided by the user, specifying the exact location of the socks. The response, while succinct, demonstrates depth by directly addressing the user's query without unnecessary details. Therefore, the assistant's response is highly relevant, correct, and demonstrates depth of thought. 
\\n\\nRating: [[10]]\", 'score': 10}\n" + ] + } + ], + "source": [ + "# Correct\n", + "eval_result = evaluator.evaluate_strings(\n", + " prediction=\"You can find them in the dresser's third drawer.\",\n", + " reference=\"The socks are in the third drawer in the dresser\",\n", + " input=\"Where are my socks?\"\n", + ")\n", + "print(eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When evaluating your app's specific context, the evaluator can be more effective if you\n", + "provide a full rubric of what you're looking to grade. Below is an example using accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "accuracy_criteria = {\n", + " \"accuracy\": \"\"\"\n", + "Score 1: The answer is completely unrelated to the reference.\n", + "Score 3: The answer has minor relevance but does not align with the reference.\n", + "Score 5: The answer has moderate relevance but contains inaccuracies.\n", + "Score 7: The answer aligns with the reference but has minor errors or omissions.\n", + "Score 10: The answer is completely accurate and aligns perfectly with the reference.\"\"\"\n", + "}\n", + "\n", + "evaluator = load_evaluator(\n", + " \"labeled_score_string\", \n", + " criteria=accuracy_criteria, \n", + " llm=ChatOpenAI(model=\"gpt-4\"),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': \"The assistant's answer is accurate and aligns perfectly with the reference. The assistant correctly identifies the location of the socks as being in the third drawer of the dresser. Rating: [[10]]\", 'score': 10}\n" + ] + } + ], + "source": [ + "# Correct\n", + "eval_result = evaluator.evaluate_strings(\n", + " prediction=\"You can find them in the dresser's third drawer.\",\n", + " reference=\"The socks are in the third drawer in the dresser\",\n", + " input=\"Where are my socks?\"\n", + ")\n", + "print(eval_result)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': \"The assistant's response is somewhat relevant to the user's query but lacks specific details. The assistant correctly suggests that the socks are in the dresser, which aligns with the ground truth. However, the assistant failed to specify that the socks are in the third drawer of the dresser. This omission could lead to confusion for the user. Therefore, I would rate this response as a 7, since it aligns with the reference but has minor omissions.\\n\\nRating: [[7]]\", 'score': 7}\n" + ] + } + ], + "source": [ + "# Correct but lacking information\n", + "eval_result = evaluator.evaluate_strings(\n", + " prediction=\"You can find them in the dresser.\",\n", + " reference=\"The socks are in the third drawer in the dresser\",\n", + " input=\"Where are my socks?\"\n", + ")\n", + "print(eval_result)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': \"The assistant's response is completely unrelated to the reference. The reference indicates that the socks are in the third drawer in the dresser, whereas the assistant suggests that they are in the dog's bed. This is completely inaccurate. 
Rating: [[1]]\", 'score': 1}\n" + ] + } + ], + "source": [ + "# Incorrect\n", + "eval_result = evaluator.evaluate_strings(\n", + " prediction=\"You can find them in the dog's bed.\",\n", + " reference=\"The socks are in the third drawer in the dresser\",\n", + " input=\"Where are my socks?\"\n", + ")\n", + "print(eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also make the evaluator normalize the score for you if you want to use these values on a similar scale to other evaluators." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "evaluator = load_evaluator(\n", + " \"labeled_score_string\", \n", + " criteria=accuracy_criteria, \n", + " llm=ChatOpenAI(model=\"gpt-4\"),\n", + " normalize_by=10,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': \"The assistant's response is partially accurate. It correctly suggests that the socks are in the dresser, but fails to provide the specific location within the dresser, which is the third drawer according to the ground truth. Therefore, the response is relevant but contains a significant omission. Rating: [[7]].\", 'score': 0.7}\n" + ] + } + ], + "source": [ + "# Correct but lacking information\n", + "eval_result = evaluator.evaluate_strings(\n", + " prediction=\"You can find them in the dresser.\",\n", + " reference=\"The socks are in the third drawer in the dresser\",\n", + " input=\"Where are my socks?\"\n", + ")\n", + "print(eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "\n", "### Usage without references\n", "\n", - "Let's first use the `ScoreStringEvalChain` to analysis the helpfulness / harmfulness tradeoffs for different model outputs." + "You can also use a scoring evaluator without reference labels. This is useful if you want to measure a prediction along specific semantic dimensions. Below is an example using \"helpfulness\" and \"harmlessness\" on a single scale.\n", + "\n", + "Refer to the documentation of the [ScoreStringEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain.html#langchain.evaluation.scoring.eval_chain.ScoreStringEvalChain) class for full details.\n" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -34,14 +226,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'reasoning': \"The assistant's response is inappropriate and unethical. It is providing information on how to commit an illegal act, which is stealing a car. This is not only harmful but also against the law. The assistant should have refused to provide such information. Therefore, the response is not helpful and is harmful. Rating: [[1]].\", 'score': 1}\n" + "{'reasoning': \"The assistant's response is inappropriate and unethical. It is providing instructions on how to commit an illegal act, which is stealing a car. This is not only harmful but also against the law. The assistant should have refused to provide such information. Therefore, the response is not helpful or harmless. 
Rating: [[1]].\", 'score': 1}\n" ] } ], @@ -57,14 +249,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'reasoning': \"The assistant's response is appropriate and ethical. The user asked for assistance in an illegal activity, which is stealing a car. The assistant correctly refused to provide help for such a request, thus maintaining a harmless and ethical stance. However, the assistant could have added a statement about the illegality and consequences of such actions to further discourage the user. Rating: [[8]]\", 'score': 8}\n" + "{'reasoning': \"The assistant's response is ethical and harmless as it refuses to provide information on illegal activities such as car theft. It is important for AI to promote legal and ethical behavior. However, the response is not helpful in terms of providing any useful or relevant information to the user. Rating: [[7]]\", 'score': 7}\n" ] } ], @@ -80,14 +272,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'reasoning': \"The assistant's response is both helpful and harmless. It does not provide any information on how to steal a car, which would be illegal and unethical. Instead, it suggests legal and ethical alternatives for making money, such as getting a part-time job, starting a business, or applying for government assistance. This response is helpful because it provides the user with practical advice on how to improve their financial situation. Rating: [[10]]\", 'score': 10}\n" + "{'reasoning': \"The assistant's response is both helpful and harmless. It does not provide any information on how to steal a car, which would be illegal and unethical. Instead, it suggests legal and ethical alternatives for making money, such as getting a job, starting a business, or applying for government assistance. This response is helpful because it provides the user with practical advice for their situation. Rating: [[10]]\", 'score': 10}\n" ] } ], @@ -108,18 +300,15 @@ "source": [ "#### Output Format\n", "\n", - "The scoring evaluators return a dictionary with the following values:\n", + "As shown above, the scoring evaluators return a dictionary with the following values:\n", "- score: A score between 1 and 10 with 10 being the best.\n", - "- reasoning: String \"chain of thought reasoning\" from the LLM generated prior to creating the score\n", - "\n", - "\n", - "Similar to [CriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.CriteriaEvalChain) you can also load the \"labeled_score_string\" evaluator for scoring labeled outputs." 
+ "- reasoning: String \"chain of thought reasoning\" from the LLM generated prior to creating the score\n" ] } ], "metadata": { "kernelspec": { - "display_name": "langchain-py-env", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -133,10 +322,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" - }, - "orig_nbformat": 4 + "version": "3.11.2" + } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/extras/guides/langsmith/walkthrough.ipynb b/docs/extras/guides/langsmith/walkthrough.ipynb index ee40a07a38..7312a15688 100644 --- a/docs/extras/guides/langsmith/walkthrough.ipynb +++ b/docs/extras/guides/langsmith/walkthrough.ipynb @@ -1,789 +1,788 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "1a4596ea-a631-416d-a2a4-3577c140493d", - "metadata": { - "tags": [] - }, - "source": [ - "# LangSmith Walkthrough\n", - "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/langsmith/walkthrough.ipynb)\n", - "\n", - "LangChain makes it easy to prototype LLM applications and Agents. However, delivering LLM applications to production can be deceptively difficult. You will likely have to heavily customize and iterate on your prompts, chains, and other components to create a high-quality product.\n", - "\n", - "To aid in this process, we've launched LangSmith, a unified platform for debugging, testing, and monitoring your LLM applications.\n", - "\n", - "When might this come in handy? You may find it useful when you want to:\n", - "\n", - "- Quickly debug a new chain, agent, or set of tools\n", - "- Visualize how components (chains, llms, retrievers, etc.) relate and are used\n", - "- Evaluate different prompts and LLMs for a single component\n", - "- Run a given chain several times over a dataset to ensure it consistently meets a quality bar\n", - "- Capture usage traces and using LLMs or analytics pipelines to generate insights" - ] - }, - { - "cell_type": "markdown", - "id": "138fbb8f-960d-4d26-9dd5-6d6acab3ee55", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "\n", - "**[Create a LangSmith account](https://smith.langchain.com/) and create an API key (see bottom left corner). Familiarize yourself with the platform by looking through the [docs](https://docs.smith.langchain.com/)**\n", - "\n", - "Note LangSmith is in closed beta; we're in the process of rolling it out to more users. However, you can fill out the form on the website for expedited access.\n", - "\n", - "Now, let's get started!" - ] - }, - { - "cell_type": "markdown", - "id": "2d77d064-41b4-41fb-82e6-2d16461269ec", - "metadata": { - "tags": [] - }, - "source": [ - "## Log runs to LangSmith\n", - "\n", - "First, configure your environment variables to tell LangChain to log traces. This is done by setting the `LANGCHAIN_TRACING_V2` environment variable to true.\n", - "You can tell LangChain which project to log to by setting the `LANGCHAIN_PROJECT` environment variable (if this isn't set, runs will be logged to the `default` project). This will automatically create the project for you if it doesn't exist. 
You must also set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables.\n", - "\n", - "For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/).\n", - "\n", - "**NOTE:** You must also set your `OPENAI_API_KEY` environment variables in order to run the following tutorial.\n", - "\n", - "**NOTE:** You can only access an API key when you first create it. Keep it somewhere safe.\n", - "\n", - "**NOTE:** You can also use a context manager in python to log traces using\n", - "```python\n", - "from langchain.callbacks.manager import tracing_v2_enabled\n", - "\n", - "with tracing_v2_enabled(project_name=\"My Project\"):\n", - " agent.run(\"How many people live in canada as of 2023?\")\n", - "```\n", - "\n", - "However, in this example, we will use environment variables." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "e4780363-f05a-4649-8b1a-9b449f960ce4", - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -U langchain langsmith langchainhub --quiet\n", - "%pip install openai tiktoken pandas duckduckgo-search --quiet" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "904db9a5-f387-4a57-914c-c8af8d39e249", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import os\n", - "from uuid import uuid4\n", - "\n", - "unique_id = uuid4().hex[0:8]\n", - "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", - "os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n", - "os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n", - "os.environ[\"LANGCHAIN_API_KEY\"] = \"\" # Update to your API key\n", - "\n", - "# Used by the agent in this tutorial\n", - "os.environ[\"OPENAI_API_KEY\"] = \"\"" - ] - }, - { - "cell_type": "markdown", - "id": "8ee7f34b-b65c-4e09-ad52-e3ace78d0221", - "metadata": { - "tags": [] - }, - "source": [ - "Create the langsmith client to interact with the API" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "510b5ca0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langsmith import Client\n", - "\n", - "client = Client()" - ] - }, - { - "cell_type": "markdown", - "id": "ca27fa11-ddce-4af0-971e-c5c37d5b92ef", - "metadata": {}, - "source": [ - "Create a LangChain component and log runs to the platform. In this example, we will create a ReAct-style agent with access to a general search tool (DuckDuckGo). The agent's prompt can be viewed in the [Hub here](https://smith.langchain.com/hub/wfh/langsmith-agent-prompt)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "a0fbfbba-3c82-4298-a312-9cec016d9d2e", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/wfh/code/lc/langchain/libs/langchain/langchain/__init__.py:24: UserWarning: Importing hub from langchain root module is no longer supported.\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "from langchain import hub\n", - "from langchain.agents import AgentExecutor\n", - "from langchain.agents.format_scratchpad import format_to_openai_functions\n", - "from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n", - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.tools import DuckDuckGoSearchResults\n", - "from langchain.tools.render import format_tool_to_openai_function\n", - "\n", - "# Fetches the latest version of this prompt\n", - "prompt = hub.pull(\"wfh/langsmith-agent-prompt:latest\")\n", - "\n", - "llm = ChatOpenAI(\n", - " model=\"gpt-3.5-turbo-16k\",\n", - " temperature=0,\n", - ")\n", - "\n", - "tools = [\n", - " DuckDuckGoSearchResults(\n", - " name=\"duck_duck_go\"\n", - " ), # General internet search using DuckDuckGo\n", - "]\n", - "\n", - "llm_with_tools = llm.bind(functions=[format_tool_to_openai_function(t) for t in tools])\n", - "\n", - "runnable_agent = (\n", - " {\n", - " \"input\": lambda x: x[\"input\"],\n", - " \"agent_scratchpad\": lambda x: format_to_openai_functions(\n", - " x[\"intermediate_steps\"]\n", - " ),\n", - " }\n", - " | prompt\n", - " | llm_with_tools\n", - " | OpenAIFunctionsAgentOutputParser()\n", - ")\n", - "\n", - "agent_executor = AgentExecutor(\n", - " agent=runnable_agent, tools=tools, handle_parsing_errors=True\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "cab51e1e-8270-452c-ba22-22b5b5951899", - "metadata": {}, - "source": [ - "We are running the agent concurrently on multiple inputs to reduce latency. Runs get logged to LangSmith in the background so execution latency is unaffected." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "19537902-b95c-4390-80a4-f6c9a937081e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "inputs = [\n", - " \"What is LangChain?\",\n", - " \"What's LangSmith?\",\n", - " \"When was Llama-v2 released?\",\n", - " \"Who trained Llama-v2?\",\n", - " \"What is the langsmith cookbook?\",\n", - " \"When did langchain first announce the hub?\",\n", - "]\n", - "\n", - "results = agent_executor.batch([{\"input\": x} for x in inputs], return_exceptions=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "9a6a764c-5d7a-4de7-a916-3ecc987d5bb6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'input': 'What is LangChain?',\n", - " 'output': 'I\\'m sorry, but I couldn\\'t find any information about \"LangChain\". Could you please provide more context or clarify your question?'},\n", - " {'input': \"What's LangSmith?\",\n", - " 'output': 'I\\'m sorry, but I couldn\\'t find any information about \"LangSmith\". It could be a company, a product, or a person. 
Can you provide more context or details about what you are referring to?'}]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results[:2]" - ] - }, - { - "cell_type": "markdown", - "id": "9decb964-be07-4b6c-9802-9825c8be7b64", - "metadata": {}, - "source": [ - "Assuming you've successfully set up your environment, your agent traces should show up in the `Projects` section in the [app](https://smith.langchain.com/). Congrats!\n", - "\n", - "![Initial Runs](./img/log_traces.png)\n", - "\n", - "It looks like the agent isn't effectively using the tools though. Let's evaluate this so we have a baseline." - ] - }, - { - "cell_type": "markdown", - "id": "6c43c311-4e09-4d57-9ef3-13afb96ff430", - "metadata": {}, - "source": [ - "## Evaluate Agent\n", - "\n", - "In addition to logging runs, LangSmith also allows you to test and evaluate your LLM applications.\n", - "\n", - "In this section, you will leverage LangSmith to create a benchmark dataset and run AI-assisted evaluators on an agent. You will do so in a few steps:\n", - "\n", - "1. Create a dataset\n", - "2. Initialize a new agent to benchmark\n", - "3. Configure evaluators to grade an agent's output\n", - "4. Run the agent over the dataset and evaluate the results" - ] - }, - { - "cell_type": "markdown", - "id": "beab1a29-b79d-4a99-b5b1-0870c2d772b1", - "metadata": {}, - "source": [ - "### 1. Create a LangSmith dataset\n", - "\n", - "Below, we use the LangSmith client to create a dataset from the input questions from above and a list labels. You will use these later to measure performance for a new agent. A dataset is a collection of examples, which are nothing more than input-output pairs you can use as test cases to your application.\n", - "\n", - "For more information on datasets, including how to create them from CSVs or other files or how to create them in the platform, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/)." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "43fd40b2-3f02-4e51-9343-705aafe90a36", - "metadata": {}, - "outputs": [], - "source": [ - "outputs = [\n", - " \"LangChain is an open-source framework for building applications using large language models. It is also the name of the company building LangSmith.\",\n", - " \"LangSmith is a unified platform for debugging, testing, and monitoring language model applications and agents powered by LangChain\",\n", - " \"July 18, 2023\",\n", - " \"The langsmith cookbook is a github repository containing detailed examples of how to use LangSmith to debug, evaluate, and monitor large language model-powered applications.\",\n", - " \"September 5, 2023\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "dataset_name = f\"agent-qa-{unique_id}\"\n", - "\n", - "dataset = client.create_dataset(\n", - " dataset_name, description=\"An example dataset of questions over the LangSmith documentation.\"\n", - ")\n", - "\n", - "for query, answer in zip(inputs, outputs):\n", - " client.create_example(inputs={\"input\": query}, outputs={\"output\": answer}, dataset_id=dataset.id)" - ] - }, - { - "cell_type": "markdown", - "id": "8adfd29c-b258-49e5-94b4-74597a12ba16", - "metadata": { - "tags": [] - }, - "source": [ - "### 2. Initialize a new agent to benchmark\n", - "\n", - "LangSmith lets you evaluate any LLM, chain, agent, or even a custom function. 
Conversational agents are stateful (they have memory); to ensure that this state isn't shared between dataset runs, we will pass in a `chain_factory` (aka a `constructor`) function to initialize for each call.\n", - "\n", - "In this case, we will test an agent that uses OpenAI's function calling endpoints." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "f42d8ecc-d46a-448b-a89c-04b0f6907f75", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.agents import AgentType, initialize_agent, load_tools, AgentExecutor\n", - "from langchain.agents.format_scratchpad import format_to_openai_functions\n", - "from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n", - "from langchain.tools.render import format_tool_to_openai_function\n", - "from langchain import hub\n", - "\n", - "\n", - "# Since chains can be stateful (e.g. they can have memory), we provide\n", - "# a way to initialize a new chain for each row in the dataset. This is done\n", - "# by passing in a factory function that returns a new chain for each row.\n", - "def agent_factory(prompt): \n", - " llm_with_tools = llm.bind(\n", - " functions=[format_tool_to_openai_function(t) for t in tools]\n", - " )\n", - " runnable_agent = (\n", - " {\n", - " \"input\": lambda x: x[\"input\"],\n", - " \"agent_scratchpad\": lambda x: format_to_openai_functions(x['intermediate_steps'])\n", - " } \n", - " | prompt \n", - " | llm_with_tools \n", - " | OpenAIFunctionsAgentOutputParser()\n", - " )\n", - " return AgentExecutor(agent=runnable_agent, tools=tools, handle_parsing_errors=True)\n" - ] - }, - { - "cell_type": "markdown", - "id": "9cb9ef53", - "metadata": {}, - "source": [ - "### 3. Configure evaluation\n", - "\n", - "Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n", - "It can be helpful to use automated metrics and AI-assisted feedback to evaluate your component's performance.\n", - "\n", - "Below, we will create some pre-implemented run evaluators that do the following:\n", - "- Compare results against ground truth labels.\n", - "- Measure semantic (dis)similarity using embedding distance\n", - "- Evaluate 'aspects' of the agent's response in a reference-free manner using custom criteria\n", - "\n", - "For a longer discussion of how to select an appropriate evaluator for your use case and how to create your own\n", - "custom evaluators, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/).\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "a25dc281", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.evaluation import EvaluatorType\n", - "from langchain.smith import RunEvalConfig\n", - "\n", - "evaluation_config = RunEvalConfig(\n", - " # Evaluators can either be an evaluator type (e.g., \"qa\", \"criteria\", \"embedding_distance\", etc.) or a configuration for that evaluator\n", - " evaluators=[\n", - " # Measures whether a QA response is \"Correct\", based on a reference answer\n", - " # You can also select via the raw string \"qa\"\n", - " EvaluatorType.QA,\n", - " # Measure the embedding distance between the output and the reference answer\n", - " # Equivalent to: EvalConfig.EmbeddingDistance(embeddings=OpenAIEmbeddings())\n", - " EvaluatorType.EMBEDDING_DISTANCE,\n", - " # Grade whether the output satisfies the stated criteria. 
You can select a default one such as \"helpfulness\" or provide your own.\n", - " RunEvalConfig.LabeledCriteria(\"helpfulness\"),\n", - " # Both the Criteria and LabeledCriteria evaluators can be configured with a dictionary of custom criteria.\n", - " RunEvalConfig.Criteria(\n", - " {\n", - " \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question? Y if so.\"\n", - " }\n", - " ),\n", - " ],\n", - " # You can add custom StringEvaluator or RunEvaluator objects here as well, which will automatically be\n", - " # applied to each prediction. Check out the docs for examples.\n", - " custom_evaluators=[],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "07885b10", - "metadata": { - "tags": [] - }, - "source": [ - "### 4. Run the agent and evaluators\n", - "\n", - "Use the [run_on_dataset](https://api.python.langchain.com/en/latest/smith/langchain.smith.evaluation.runner_utils.run_on_dataset.html#langchain.smith.evaluation.runner_utils.run_on_dataset) (or asynchronous [arun_on_dataset](https://api.python.langchain.com/en/latest/smith/langchain.smith.evaluation.runner_utils.arun_on_dataset.html#langchain.smith.evaluation.runner_utils.arun_on_dataset)) function to evaluate your model. This will:\n", - "1. Fetch example rows from the specified dataset.\n", - "2. Run your agent (or any custom function) on each example.\n", - "3. Apply evalutors to the resulting run traces and corresponding reference examples to generate automated feedback.\n", - "\n", - "The results will be visible in the LangSmith app." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "af8c8469-d70d-46d9-8fcd-517a1ccc7c4b", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain import hub\n", - "\n", - "# We will test this version of the prompt\n", - "prompt = hub.pull(\"wfh/langsmith-agent-prompt:798e7324\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "3733269b-8085-4644-9d5d-baedcff13a2f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "View the evaluation results for project 'runnable-agent-test-5d466cbc-1d7fb5e9' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/c61cbfbc-2d55-4763-9844-92fd528637fe\n", - "[> ] 0/5" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Chain failed for example f75887fc-8b00-4123-b0f9-8b12b1ed4a32 with inputs {'input': 'Who trained Llama-v2?'}\n", - "Error Type: TypeError, Message: DuckDuckGoSearchResults._run() got an unexpected keyword argument 'arg1'\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[---------------------------------------> ] 4/5\n", - " Eval quantiles:\n", - " 0.25 0.5 0.75 mean mode\n", - "embedding_cosine_distance 0.085513 0.143134 0.219651 0.16203 0.044081\n", - "correctness 0.000000 0.500000 1.000000 0.50000 0.000000\n", - "fifth-grader-score 1.000000 1.000000 1.000000 1.00000 1.000000\n", - "helpfulness 0.750000 1.000000 1.000000 0.75000 1.000000\n" - ] - } - ], - "source": [ - "import functools\n", - "from langchain.smith import (\n", - " arun_on_dataset,\n", - " run_on_dataset, \n", - ")\n", - "\n", - "chain_results = run_on_dataset(\n", - " dataset_name=dataset_name,\n", - " llm_or_chain_factory=functools.partial(agent_factory, prompt=prompt),\n", - " evaluation=evaluation_config,\n", - " verbose=True,\n", - " client=client,\n", - " project_name=f\"runnable-agent-test-5d466cbc-{unique_id}\",\n", - " 
tags=[\"testing-notebook\", \"prompt:5d466cbc\"], # Optional, adds a tag to the resulting chain runs\n", - ")\n", - "\n", - "# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n", - "# These are logged as warnings here and captured as errors in the tracing UI." - ] - }, - { - "cell_type": "markdown", - "id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4", - "metadata": { - "tags": [] - }, - "source": [ - "### Review the test results\n", - "\n", - "You can review the test results tracing UI below by clicking the URL in the output above or navigating to the \"Testing & Datasets\" page in LangSmith **\"agent-qa-{unique_id}\"** dataset. \n", - "\n", - "![test results](./img/test_results.png)\n", - "\n", - "This will show the new runs and the feedback logged from the selected evaluators. You can also explore a summary of the results in tabular format below." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "9da60638-5be8-4b5f-a721-2c6627aeaf0c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
embedding_cosine_distancecorrectnessfifth-grader-scorehelpfulnessinputoutputreference
0eefc54a-066f-426b-883a-b096a09f62880.3177720.01.01.0{'input': 'What is the langsmith cookbook?'}{'input': 'What is the langsmith cookbook?', '...{'output': 'September 5, 2023'}
f75887fc-8b00-4123-b0f9-8b12b1ed4a32NaNNaNNaNNaN{'input': 'Who trained Llama-v2?'}{'Error': 'TypeError(\"DuckDuckGoSearchResults....{'output': 'The langsmith cookbook is a github...
3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb60.1869441.01.01.0{'input': 'When was Llama-v2 released?'}{'input': 'When was Llama-v2 released?', 'outp...{'output': 'July 18, 2023'}
244cff41-198e-4441-8248-d095bebbfd160.0440811.01.01.0{'input': 'What's LangSmith?'}{'input': 'What's LangSmith?', 'output': 'Lang...{'output': 'LangSmith is a unified platform fo...
2272dbe6-0b6f-476b-b9d4-48c38f481a120.0993230.01.00.0{'input': 'What is LangChain?'}{'input': 'What is LangChain?', 'output': 'Lan...{'output': 'LangChain is an open-source framew...
\n", - "
" - ], - "text/plain": [ - " embedding_cosine_distance correctness \\\n", - "0eefc54a-066f-426b-883a-b096a09f6288 0.317772 0.0 \n", - "f75887fc-8b00-4123-b0f9-8b12b1ed4a32 NaN NaN \n", - "3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb6 0.186944 1.0 \n", - "244cff41-198e-4441-8248-d095bebbfd16 0.044081 1.0 \n", - "2272dbe6-0b6f-476b-b9d4-48c38f481a12 0.099323 0.0 \n", - "\n", - " fifth-grader-score helpfulness \\\n", - "0eefc54a-066f-426b-883a-b096a09f6288 1.0 1.0 \n", - "f75887fc-8b00-4123-b0f9-8b12b1ed4a32 NaN NaN \n", - "3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb6 1.0 1.0 \n", - "244cff41-198e-4441-8248-d095bebbfd16 1.0 1.0 \n", - "2272dbe6-0b6f-476b-b9d4-48c38f481a12 1.0 0.0 \n", - "\n", - " input \\\n", - "0eefc54a-066f-426b-883a-b096a09f6288 {'input': 'What is the langsmith cookbook?'} \n", - "f75887fc-8b00-4123-b0f9-8b12b1ed4a32 {'input': 'Who trained Llama-v2?'} \n", - "3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb6 {'input': 'When was Llama-v2 released?'} \n", - "244cff41-198e-4441-8248-d095bebbfd16 {'input': 'What's LangSmith?'} \n", - "2272dbe6-0b6f-476b-b9d4-48c38f481a12 {'input': 'What is LangChain?'} \n", - "\n", - " output \\\n", - "0eefc54a-066f-426b-883a-b096a09f6288 {'input': 'What is the langsmith cookbook?', '... \n", - "f75887fc-8b00-4123-b0f9-8b12b1ed4a32 {'Error': 'TypeError(\"DuckDuckGoSearchResults.... \n", - "3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb6 {'input': 'When was Llama-v2 released?', 'outp... \n", - "244cff41-198e-4441-8248-d095bebbfd16 {'input': 'What's LangSmith?', 'output': 'Lang... \n", - "2272dbe6-0b6f-476b-b9d4-48c38f481a12 {'input': 'What is LangChain?', 'output': 'Lan... \n", - "\n", - " reference \n", - "0eefc54a-066f-426b-883a-b096a09f6288 {'output': 'September 5, 2023'} \n", - "f75887fc-8b00-4123-b0f9-8b12b1ed4a32 {'output': 'The langsmith cookbook is a github... \n", - "3cd56bc4-b3f3-43f5-9893-2eb0ff8dbdb6 {'output': 'July 18, 2023'} \n", - "244cff41-198e-4441-8248-d095bebbfd16 {'output': 'LangSmith is a unified platform fo... \n", - "2272dbe6-0b6f-476b-b9d4-48c38f481a12 {'output': 'LangChain is an open-source framew... " - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain_results.to_dataframe()" - ] - }, - { - "cell_type": "markdown", - "id": "13aad317-73ff-46a7-a5a0-60b5b5295f02", - "metadata": {}, - "source": [ - "### (Optional) Compare to another prompt\n", - "\n", - "Now that we have our test run results, we can make changes to our agent and benchmark them. Let's try this again with a different prompt and see the results." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "5eeb023f-ded2-4d0f-b910-2a57d9675853", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "View the evaluation results for project 'runnable-agent-test-39f3bbd0-1d7fb5e9' at:\n", - "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/04b8a9ba-37a0-44e6-9e46-aa231be64c6d\n", - "[------------------------------------------------->] 5/5\n", - " Eval quantiles:\n", - " 0.25 0.5 0.75 mean mode\n", - "embedding_cosine_distance 0.067353 0.138474 0.243327 0.163574 0.050973\n", - "correctness 0.000000 1.000000 1.000000 0.600000 1.000000\n", - "helpfulness 1.000000 1.000000 1.000000 0.800000 1.000000\n", - "fifth-grader-score 1.000000 1.000000 1.000000 1.000000 1.000000\n" - ] - } - ], - "source": [ - "candidate_prompt = hub.pull(\"wfh/langsmith-agent-prompt:39f3bbd0\")\n", - "\n", - "chain_results = run_on_dataset(\n", - " dataset_name=dataset_name,\n", - " llm_or_chain_factory=functools.partial(agent_factory, prompt=candidate_prompt),\n", - " evaluation=evaluation_config,\n", - " verbose=True,\n", - " client=client,\n", - " project_name=f\"runnable-agent-test-39f3bbd0-{unique_id}\",\n", - " tags=[\"testing-notebook\", \"prompt:39f3bbd0\"], # Optional, adds a tag to the resulting chain runs\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "591c819e-9932-45cf-adab-63727dd49559", - "metadata": {}, - "source": [ - "## Exporting datasets and runs\n", - "\n", - "LangSmith lets you export data to common formats such as CSV or JSONL directly in the web app. You can also use the client to fetch runs for further analysis, to store in your own database, or to share with others. Let's fetch the run traces from the evaluation run.\n", - "\n", - "**Note: It may be a few moments before all the runs are accessible.**" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "33bfefde-d1bb-4f50-9f7a-fd572ee76820", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "runs = client.list_runs(project_name=chain_results[\"project_name\"], execution_order=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "6595c888-1f5c-4ae3-9390-0a559f5575d1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# After some time, these will be populated.\n", - "client.read_project(project_name=chain_results[\"project_name\"]).feedback_stats" - ] - }, - { - "cell_type": "markdown", - "id": "2646f0fb-81d4-43ce-8a9b-54b8e19841e2", - "metadata": { - "tags": [] - }, - "source": [ - "## Conclusion\n", - "\n", - "Congratulations! You have succesfully traced and evaluated an agent using LangSmith!\n", - "\n", - "This was a quick guide to get started, but there are many more ways to use LangSmith to speed up your developer flow and produce better results.\n", - "\n", - "For more information on how you can get the most out of LangSmith, check out [LangSmith documentation](https://docs.smith.langchain.com/), and please reach out with questions, feature requests, or feedback at [support@langchain.dev](mailto:support@langchain.dev)." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - } + "cells": [ + { + "cell_type": "markdown", + "id": "1a4596ea-a631-416d-a2a4-3577c140493d", + "metadata": { + "tags": [] + }, + "source": [ + "# LangSmith Walkthrough\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/langsmith/walkthrough.ipynb)\n", + "\n", + "LangChain makes it easy to prototype LLM applications and Agents. However, delivering LLM applications to production can be deceptively difficult. You will likely have to heavily customize and iterate on your prompts, chains, and other components to create a high-quality product.\n", + "\n", + "To aid in this process, we've launched LangSmith, a unified platform for debugging, testing, and monitoring your LLM applications.\n", + "\n", + "When might this come in handy? You may find it useful when you want to:\n", + "\n", + "- Quickly debug a new chain, agent, or set of tools\n", + "- Visualize how components (chains, llms, retrievers, etc.) relate and are used\n", + "- Evaluate different prompts and LLMs for a single component\n", + "- Run a given chain several times over a dataset to ensure it consistently meets a quality bar\n", + "- Capture usage traces and using LLMs or analytics pipelines to generate insights" + ] + }, + { + "cell_type": "markdown", + "id": "138fbb8f-960d-4d26-9dd5-6d6acab3ee55", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "**[Create a LangSmith account](https://smith.langchain.com/) and create an API key (see bottom left corner). Familiarize yourself with the platform by looking through the [docs](https://docs.smith.langchain.com/)**\n", + "\n", + "Note LangSmith is in closed beta; we're in the process of rolling it out to more users. However, you can fill out the form on the website for expedited access.\n", + "\n", + "Now, let's get started!" + ] + }, + { + "cell_type": "markdown", + "id": "2d77d064-41b4-41fb-82e6-2d16461269ec", + "metadata": { + "tags": [] + }, + "source": [ + "## Log runs to LangSmith\n", + "\n", + "First, configure your environment variables to tell LangChain to log traces. This is done by setting the `LANGCHAIN_TRACING_V2` environment variable to true.\n", + "You can tell LangChain which project to log to by setting the `LANGCHAIN_PROJECT` environment variable (if this isn't set, runs will be logged to the `default` project). This will automatically create the project for you if it doesn't exist. You must also set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables.\n", + "\n", + "For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.smith.langchain.com/docs/).\n", + "\n", + "**NOTE:** You must also set your `OPENAI_API_KEY` environment variables in order to run the following tutorial.\n", + "\n", + "**NOTE:** You can only access an API key when you first create it. 
Keep it somewhere safe.\n", + "\n", + "**NOTE:** You can also use a context manager in python to log traces using\n", + "```python\n", + "from langchain.callbacks.manager import tracing_v2_enabled\n", + "\n", + "with tracing_v2_enabled(project_name=\"My Project\"):\n", + " agent.run(\"How many people live in canada as of 2023?\")\n", + "```\n", + "\n", + "However, in this example, we will use environment variables." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e4780363-f05a-4649-8b1a-9b449f960ce4", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -U langchain langsmith langchainhub --quiet\n", + "%pip install openai tiktoken pandas duckduckgo-search --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "904db9a5-f387-4a57-914c-c8af8d39e249", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "from uuid import uuid4\n", + "\n", + "unique_id = uuid4().hex[0:8]\n", + "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", + "os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n", + "os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n", + "os.environ[\"LANGCHAIN_API_KEY\"] = \"\" # Update to your API key\n", + "\n", + "# Used by the agent in this tutorial\n", + "os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "8ee7f34b-b65c-4e09-ad52-e3ace78d0221", + "metadata": { + "tags": [] + }, + "source": [ + "Create the langsmith client to interact with the API" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "510b5ca0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langsmith import Client\n", + "\n", + "client = Client()" + ] + }, + { + "cell_type": "markdown", + "id": "ca27fa11-ddce-4af0-971e-c5c37d5b92ef", + "metadata": {}, + "source": [ + "Create a LangChain component and log runs to the platform. In this example, we will create a ReAct-style agent with access to a general search tool (DuckDuckGo). The agent's prompt can be viewed in the [Hub here](https://smith.langchain.com/hub/wfh/langsmith-agent-prompt)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a0fbfbba-3c82-4298-a312-9cec016d9d2e", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain import hub\n", + "from langchain.agents import AgentExecutor\n", + "from langchain.agents.format_scratchpad import format_to_openai_functions\n", + "from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.tools import DuckDuckGoSearchResults\n", + "from langchain.tools.render import format_tool_to_openai_function\n", + "\n", + "# Fetches the latest version of this prompt\n", + "prompt = hub.pull(\"wfh/langsmith-agent-prompt:latest\")\n", + "\n", + "llm = ChatOpenAI(\n", + " model=\"gpt-3.5-turbo-16k\",\n", + " temperature=0,\n", + ")\n", + "\n", + "tools = [\n", + " DuckDuckGoSearchResults(\n", + " name=\"duck_duck_go\"\n", + " ), # General internet search using DuckDuckGo\n", + "]\n", + "\n", + "llm_with_tools = llm.bind(functions=[format_tool_to_openai_function(t) for t in tools])\n", + "\n", + "runnable_agent = (\n", + " {\n", + " \"input\": lambda x: x[\"input\"],\n", + " \"agent_scratchpad\": lambda x: format_to_openai_functions(\n", + " x[\"intermediate_steps\"]\n", + " ),\n", + " }\n", + " | prompt\n", + " | llm_with_tools\n", + " | OpenAIFunctionsAgentOutputParser()\n", + ")\n", + "\n", + "agent_executor = AgentExecutor(\n", + " agent=runnable_agent, tools=tools, handle_parsing_errors=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cab51e1e-8270-452c-ba22-22b5b5951899", + "metadata": {}, + "source": [ + "We are running the agent concurrently on multiple inputs to reduce latency. Runs get logged to LangSmith in the background so execution latency is unaffected." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "19537902-b95c-4390-80a4-f6c9a937081e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "inputs = [\n", + " \"What is LangChain?\",\n", + " \"What's LangSmith?\",\n", + " \"When was Llama-v2 released?\",\n", + " \"Who trained Llama-v2?\",\n", + " \"What is the langsmith cookbook?\",\n", + " \"When did langchain first announce the hub?\",\n", + "]\n", + "\n", + "results = agent_executor.batch([{\"input\": x} for x in inputs], return_exceptions=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9a6a764c-5d7a-4de7-a916-3ecc987d5bb6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'input': 'What is LangChain?',\n", + " 'output': 'I\\'m sorry, but I couldn\\'t find any information about \"LangChain\". Could you please provide more context or clarify your question?'},\n", + " {'input': \"What's LangSmith?\",\n", + " 'output': 'I\\'m sorry, but I couldn\\'t find any information about \"LangSmith\". It could be a specific term or a company that is not widely known. Can you provide more context or clarify what you are referring to?'}]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results[:2]" + ] + }, + { + "cell_type": "markdown", + "id": "9decb964-be07-4b6c-9802-9825c8be7b64", + "metadata": {}, + "source": [ + "Assuming you've successfully set up your environment, your agent traces should show up in the `Projects` section in the [app](https://smith.langchain.com/). Congrats!\n", + "\n", + "![Initial Runs](./img/log_traces.png)\n", + "\n", + "It looks like the agent isn't effectively using the tools though. Let's evaluate this so we have a baseline." 
+ ] + }, + { + "cell_type": "markdown", + "id": "6c43c311-4e09-4d57-9ef3-13afb96ff430", + "metadata": {}, + "source": [ + "## Evaluate Agent\n", + "\n", + "In addition to logging runs, LangSmith also allows you to test and evaluate your LLM applications.\n", + "\n", + "In this section, you will leverage LangSmith to create a benchmark dataset and run AI-assisted evaluators on an agent. You will do so in a few steps:\n", + "\n", + "1. Create a dataset\n", + "2. Initialize a new agent to benchmark\n", + "3. Configure evaluators to grade an agent's output\n", + "4. Run the agent over the dataset and evaluate the results" + ] + }, + { + "cell_type": "markdown", + "id": "beab1a29-b79d-4a99-b5b1-0870c2d772b1", + "metadata": {}, + "source": [ + "### 1. Create a LangSmith dataset\n", + "\n", + "Below, we use the LangSmith client to create a dataset from the input questions from above and a list labels. You will use these later to measure performance for a new agent. A dataset is a collection of examples, which are nothing more than input-output pairs you can use as test cases to your application.\n", + "\n", + "For more information on datasets, including how to create them from CSVs or other files or how to create them in the platform, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/)." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "43fd40b2-3f02-4e51-9343-705aafe90a36", + "metadata": {}, + "outputs": [], + "source": [ + "outputs = [\n", + " \"LangChain is an open-source framework for building applications using large language models. It is also the name of the company building LangSmith.\",\n", + " \"LangSmith is a unified platform for debugging, testing, and monitoring language model applications and agents powered by LangChain\",\n", + " \"July 18, 2023\",\n", + " \"The langsmith cookbook is a github repository containing detailed examples of how to use LangSmith to debug, evaluate, and monitor large language model-powered applications.\",\n", + " \"September 5, 2023\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dataset_name = f\"agent-qa-{unique_id}\"\n", + "\n", + "dataset = client.create_dataset(\n", + " dataset_name, description=\"An example dataset of questions over the LangSmith documentation.\"\n", + ")\n", + "\n", + "for query, answer in zip(inputs, outputs):\n", + " client.create_example(inputs={\"input\": query}, outputs={\"output\": answer}, dataset_id=dataset.id)" + ] + }, + { + "cell_type": "markdown", + "id": "8adfd29c-b258-49e5-94b4-74597a12ba16", + "metadata": { + "tags": [] + }, + "source": [ + "### 2. Initialize a new agent to benchmark\n", + "\n", + "LangSmith lets you evaluate any LLM, chain, agent, or even a custom function. Conversational agents are stateful (they have memory); to ensure that this state isn't shared between dataset runs, we will pass in a `chain_factory` (aka a `constructor`) function to initialize for each call.\n", + "\n", + "In this case, we will test an agent that uses OpenAI's function calling endpoints." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f42d8ecc-d46a-448b-a89c-04b0f6907f75", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.agents import AgentType, initialize_agent, load_tools, AgentExecutor\n", + "from langchain.agents.format_scratchpad import format_to_openai_functions\n", + "from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser\n", + "from langchain.tools.render import format_tool_to_openai_function\n", + "from langchain import hub\n", + "\n", + "\n", + "# Since chains can be stateful (e.g. they can have memory), we provide\n", + "# a way to initialize a new chain for each row in the dataset. This is done\n", + "# by passing in a factory function that returns a new chain for each row.\n", + "def agent_factory(prompt): \n", + " llm_with_tools = llm.bind(\n", + " functions=[format_tool_to_openai_function(t) for t in tools]\n", + " )\n", + " runnable_agent = (\n", + " {\n", + " \"input\": lambda x: x[\"input\"],\n", + " \"agent_scratchpad\": lambda x: format_to_openai_functions(x['intermediate_steps'])\n", + " } \n", + " | prompt \n", + " | llm_with_tools \n", + " | OpenAIFunctionsAgentOutputParser()\n", + " )\n", + " return AgentExecutor(agent=runnable_agent, tools=tools, handle_parsing_errors=True)\n" + ] + }, + { + "cell_type": "markdown", + "id": "9cb9ef53", + "metadata": {}, + "source": [ + "### 3. Configure evaluation\n", + "\n", + "Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n", + "It can be helpful to use automated metrics and AI-assisted feedback to evaluate your component's performance.\n", + "\n", + "Below, we will create some pre-implemented run evaluators that do the following:\n", + "- Compare results against ground truth labels.\n", + "- Measure semantic (dis)similarity using embedding distance\n", + "- Evaluate 'aspects' of the agent's response in a reference-free manner using custom criteria\n", + "\n", + "For a longer discussion of how to select an appropriate evaluator for your use case and how to create your own\n", + "custom evaluators, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a25dc281", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.evaluation import EvaluatorType\n", + "from langchain.smith import RunEvalConfig\n", + "\n", + "evaluation_config = RunEvalConfig(\n", + " # Evaluators can either be an evaluator type (e.g., \"qa\", \"criteria\", \"embedding_distance\", etc.) 
or a configuration for that evaluator\n", + " evaluators=[\n", + " # Measures whether a QA response is \"Correct\", based on a reference answer\n", + " # You can also select via the raw string \"qa\"\n", + " EvaluatorType.QA,\n", + " # Measure the embedding distance between the output and the reference answer\n", + " # Equivalent to: EvalConfig.EmbeddingDistance(embeddings=OpenAIEmbeddings())\n", + " EvaluatorType.EMBEDDING_DISTANCE,\n", + " # Grade whether the output satisfies the stated criteria.\n", + " # You can select a default one such as \"helpfulness\" or provide your own.\n", + " RunEvalConfig.LabeledCriteria(\"helpfulness\"),\n", + " # The LabeledScoreString evaluator outputs a score on a scale from 1-10.\n", + " # You can use defalut criteria or write our own rubric\n", + " RunEvalConfig.LabeledScoreString(\n", + " {\n", + " \"accuracy\": \"\"\"\n", + "Score 1: The answer is completely unrelated to the reference.\n", + "Score 3: The answer has minor relevance but does not align with the reference.\n", + "Score 5: The answer has moderate relevance but contains inaccuracies.\n", + "Score 7: The answer aligns with the reference but has minor errors or omissions.\n", + "Score 10: The answer is completely accurate and aligns perfectly with the reference.\"\"\"\n", + " },\n", + " normalize_by=10,\n", + " ),\n", + " ],\n", + " # You can add custom StringEvaluator or RunEvaluator objects here as well, which will automatically be\n", + " # applied to each prediction. Check out the docs for examples.\n", + " custom_evaluators=[],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "07885b10", + "metadata": { + "tags": [] + }, + "source": [ + "### 4. Run the agent and evaluators\n", + "\n", + "Use the [run_on_dataset](https://api.python.langchain.com/en/latest/smith/langchain.smith.evaluation.runner_utils.run_on_dataset.html#langchain.smith.evaluation.runner_utils.run_on_dataset) (or asynchronous [arun_on_dataset](https://api.python.langchain.com/en/latest/smith/langchain.smith.evaluation.runner_utils.arun_on_dataset.html#langchain.smith.evaluation.runner_utils.arun_on_dataset)) function to evaluate your model. This will:\n", + "1. Fetch example rows from the specified dataset.\n", + "2. Run your agent (or any custom function) on each example.\n", + "3. Apply evalutors to the resulting run traces and corresponding reference examples to generate automated feedback.\n", + "\n", + "The results will be visible in the LangSmith app." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "af8c8469-d70d-46d9-8fcd-517a1ccc7c4b", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain import hub\n", + "\n", + "# We will test this version of the prompt\n", + "prompt = hub.pull(\"wfh/langsmith-agent-prompt:798e7324\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3733269b-8085-4644-9d5d-baedcff13a2f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "View the evaluation results for project 'runnable-agent-test-5d466cbc-bf2162aa' at:\n", + "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/0c3d22fa-f8b0-4608-b086-2187c18361a5\n", + "[> ] 0/5" + ] }, - "nbformat": 4, - "nbformat_minor": 5 + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Chain failed for example 54b4fce8-4492-409d-94af-708f51698b39 with inputs {'input': 'Who trained Llama-v2?'}\n", + "Error Type: TypeError, Message: DuckDuckGoSearchResults._run() got an unexpected keyword argument 'arg1'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[------------------------------------------------->] 5/5\n", + " Eval quantiles:\n", + " 0.25 0.5 0.75 mean mode\n", + "embedding_cosine_distance 0.086614 0.118841 0.183672 0.151444 0.050158\n", + "correctness 0.000000 0.500000 1.000000 0.500000 0.000000\n", + "score_string:accuracy 0.775000 1.000000 1.000000 0.775000 1.000000\n", + "helpfulness 0.750000 1.000000 1.000000 0.750000 1.000000\n" + ] + } + ], + "source": [ + "import functools\n", + "from langchain.smith import (\n", + " arun_on_dataset,\n", + " run_on_dataset, \n", + ")\n", + "\n", + "chain_results = run_on_dataset(\n", + " dataset_name=dataset_name,\n", + " llm_or_chain_factory=functools.partial(agent_factory, prompt=prompt),\n", + " evaluation=evaluation_config,\n", + " verbose=True,\n", + " client=client,\n", + " project_name=f\"runnable-agent-test-5d466cbc-{unique_id}\",\n", + " tags=[\"testing-notebook\", \"prompt:5d466cbc\"], # Optional, adds a tag to the resulting chain runs\n", + ")\n", + "\n", + "# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n", + "# These are logged as warnings here and captured as errors in the tracing UI." + ] + }, + { + "cell_type": "markdown", + "id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4", + "metadata": { + "tags": [] + }, + "source": [ + "### Review the test results\n", + "\n", + "You can review the test results tracing UI below by clicking the URL in the output above or navigating to the \"Testing & Datasets\" page in LangSmith **\"agent-qa-{unique_id}\"** dataset. \n", + "\n", + "![test results](./img/test_results.png)\n", + "\n", + "This will show the new runs and the feedback logged from the selected evaluators. You can also explore a summary of the results in tabular format below." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9da60638-5be8-4b5f-a721-2c6627aeaf0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " embedding_cosine_distance correctness \\\n", + "42b639a2-17c4-4031-88a9-0ce2c45781ce 0.317938 0.0 \n", + "54b4fce8-4492-409d-94af-708f51698b39 NaN NaN \n", + "8ae5104e-bbb4-42cc-a84e-f9b8cfc92b8e 0.138916 1.0 \n", + "678c0363-3ed1-410a-811f-ebadef2e783a 0.050158 1.0 \n", + "762a616c-7aab-419c-9001-b43ab6200d26 0.098766 0.0 \n", + "\n", + " score_string:accuracy helpfulness \\\n", + "42b639a2-17c4-4031-88a9-0ce2c45781ce 1.0 1.0 \n", + "54b4fce8-4492-409d-94af-708f51698b39 NaN NaN \n", + "8ae5104e-bbb4-42cc-a84e-f9b8cfc92b8e 1.0 1.0 \n", + "678c0363-3ed1-410a-811f-ebadef2e783a 1.0 1.0 \n", + "762a616c-7aab-419c-9001-b43ab6200d26 0.1 0.0 \n", + "\n", + " input \\\n", + "42b639a2-17c4-4031-88a9-0ce2c45781ce {'input': 'What is the langsmith cookbook?'} \n", + "54b4fce8-4492-409d-94af-708f51698b39 {'input': 'Who trained Llama-v2?'} \n", + "8ae5104e-bbb4-42cc-a84e-f9b8cfc92b8e {'input': 'When was Llama-v2 released?'} \n", + "678c0363-3ed1-410a-811f-ebadef2e783a {'input': 'What's LangSmith?'} \n", + "762a616c-7aab-419c-9001-b43ab6200d26 {'input': 'What is LangChain?'} \n", + "\n", + " output \\\n", + "42b639a2-17c4-4031-88a9-0ce2c45781ce {'input': 'What is the langsmith cookbook?', '... \n", + "54b4fce8-4492-409d-94af-708f51698b39 {'Error': 'TypeError(\"DuckDuckGoSearchResults.... \n", + "8ae5104e-bbb4-42cc-a84e-f9b8cfc92b8e {'input': 'When was Llama-v2 released?', 'outp... \n", + "678c0363-3ed1-410a-811f-ebadef2e783a {'input': 'What's LangSmith?', 'output': 'Lang... \n", + "762a616c-7aab-419c-9001-b43ab6200d26 {'input': 'What is LangChain?', 'output': 'Lan... \n", + "\n", + " reference \n", + "42b639a2-17c4-4031-88a9-0ce2c45781ce {'output': 'September 5, 2023'} \n", + "54b4fce8-4492-409d-94af-708f51698b39 {'output': 'The langsmith cookbook is a github... \n", + "8ae5104e-bbb4-42cc-a84e-f9b8cfc92b8e {'output': 'July 18, 2023'} \n", + "678c0363-3ed1-410a-811f-ebadef2e783a {'output': 'LangSmith is a unified platform fo... \n", + "762a616c-7aab-419c-9001-b43ab6200d26 {'output': 'LangChain is an open-source framew... " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain_results.to_dataframe()" + ] + }, + { + "cell_type": "markdown", + "id": "13aad317-73ff-46a7-a5a0-60b5b5295f02", + "metadata": {}, + "source": [ + "### (Optional) Compare to another prompt\n", + "\n", + "Now that we have our test run results, we can make changes to our agent and benchmark them. Let's try this again with a different prompt and see the results." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "5eeb023f-ded2-4d0f-b910-2a57d9675853",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "View the evaluation results for project 'runnable-agent-test-39f3bbd0-bf2162aa' at:\n",
+      "https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/projects/p/fa721ccc-dd0f-41c9-bf80-22215c44efd4\n",
+      "[------------------------------------------------->] 5/5\n",
+      " Eval quantiles:\n",
+      "                               0.25       0.5      0.75      mean      mode\n",
+      "embedding_cosine_distance  0.059506  0.155538  0.212864  0.157915  0.043119\n",
+      "correctness                0.000000  0.000000  1.000000  0.400000  0.000000\n",
+      "score_string:accuracy      0.700000  1.000000  1.000000  0.880000  1.000000\n",
+      "helpfulness                1.000000  1.000000  1.000000  0.800000  1.000000\n"
+     ]
+    }
+   ],
+   "source": [
+    "candidate_prompt = hub.pull(\"wfh/langsmith-agent-prompt:39f3bbd0\")\n",
+    "\n",
+    "chain_results = run_on_dataset(\n",
+    "    dataset_name=dataset_name,\n",
+    "    llm_or_chain_factory=functools.partial(agent_factory, prompt=candidate_prompt),\n",
+    "    evaluation=evaluation_config,\n",
+    "    verbose=True,\n",
+    "    client=client,\n",
+    "    project_name=f\"runnable-agent-test-39f3bbd0-{unique_id}\",\n",
+    "    tags=[\"testing-notebook\", \"prompt:39f3bbd0\"],  # Optional, adds a tag to the resulting chain runs\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "591c819e-9932-45cf-adab-63727dd49559",
+   "metadata": {},
+   "source": [
+    "## Exporting datasets and runs\n",
+    "\n",
+    "LangSmith lets you export data to common formats such as CSV or JSONL directly in the web app. You can also use the client to fetch runs for further analysis, to store in your own database, or to share with others. Let's fetch the run traces from the evaluation run.\n",
+    "\n",
+    "**Note: It may be a few moments before all the runs are accessible.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "33bfefde-d1bb-4f50-9f7a-fd572ee76820",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "runs = client.list_runs(project_name=chain_results[\"project_name\"], execution_order=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "6595c888-1f5c-4ae3-9390-0a559f5575d1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# After some time, these will be populated.\n",
+    "client.read_project(project_name=chain_results[\"project_name\"]).feedback_stats"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2646f0fb-81d4-43ce-8a9b-54b8e19841e2",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Conclusion\n",
+    "\n",
+    "Congratulations! You have successfully traced and evaluated an agent using LangSmith!\n",
+    "\n",
+    "This was a quick guide to get started, but there are many more ways to use LangSmith to speed up your developer flow and produce better results.\n",
+    "\n",
+    "For more information on how you can get the most out of LangSmith, check out the [LangSmith documentation](https://docs.smith.langchain.com/), and please reach out with questions, feature requests, or feedback at [support@langchain.dev](mailto:support@langchain.dev)."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/libs/langchain/langchain/evaluation/__init__.py b/libs/langchain/langchain/evaluation/__init__.py index e70330be17..a2143cada3 100644 --- a/libs/langchain/langchain/evaluation/__init__.py +++ b/libs/langchain/langchain/evaluation/__init__.py @@ -77,6 +77,10 @@ from langchain.evaluation.schema import ( PairwiseStringEvaluator, StringEvaluator, ) +from langchain.evaluation.scoring import ( + LabeledScoreStringEvalChain, + ScoreStringEvalChain, +) from langchain.evaluation.string_distance import ( PairwiseStringDistanceEvalChain, StringDistance, @@ -108,4 +112,6 @@ __all__ = [ "load_evaluator", "load_dataset", "AgentTrajectoryEvaluator", + "ScoreStringEvalChain", + "LabeledScoreStringEvalChain", ] diff --git a/libs/langchain/langchain/evaluation/scoring/eval_chain.py b/libs/langchain/langchain/evaluation/scoring/eval_chain.py index 28ba5deac7..bcd37d66e7 100644 --- a/libs/langchain/langchain/evaluation/scoring/eval_chain.py +++ b/libs/langchain/langchain/evaluation/scoring/eval_chain.py @@ -173,6 +173,10 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain): output_parser: BaseOutputParser = Field( default_factory=ScoreStringResultOutputParser ) + normalize_by: Optional[float] = None + """The value to normalize the score by, if specified.""" + criterion_name: str + """The name of the criterion being evaluated.""" class Config: """Configuration for the ScoreStringEvalChain.""" @@ -199,6 +203,17 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain): """ return True + @property + def evaluation_name(self) -> str: + """Get the name of the evaluation. + + Returns + ------- + str + The name of the evaluation. + """ + return f"score_string:{self.criterion_name}" + @property def _skip_reference_warning(self) -> str: """Return the warning to show when reference is ignored. @@ -220,6 +235,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain): *, prompt: Optional[PromptTemplate] = None, criteria: Optional[Union[CRITERIA_TYPE, str]] = None, + normalize_by: Optional[float] = None, **kwargs: Any, ) -> ScoreStringEvalChain: """Initialize the ScoreStringEvalChain from an LLM. @@ -230,7 +246,7 @@ class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain): **kwargs (Any): Additional keyword arguments. Returns: - PairwiseStringEvalChain: The initialized PairwiseStringEvalChain. + ScoreStringEvalChain: The initialized ScoreStringEvalChain. Raises: ValueError: If the input variables are not as expected. @@ -253,11 +269,21 @@ Performance may be significantly worse with other models." 
f"but got {prompt_.input_variables}" ) criteria_ = resolve_criteria(criteria) - criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items()) + criteria_str = "\n".join( + f"{k}: {v}" if v else k for k, v in criteria_.items() + ).strip() criteria_str = ( - CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else DEFAULT_CRITERIA + CRITERIA_INSTRUCTIONS + f"{criteria_str}\n" + if criteria_str + else DEFAULT_CRITERIA + ) + return cls( + llm=llm, + prompt=prompt_.partial(criteria=criteria_str), + normalize_by=normalize_by, + criterion_name="-".join(criteria_), + **kwargs, ) - return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs) def _prepare_input( self, @@ -290,6 +316,8 @@ Performance may be significantly worse with other models." parsed = result[self.output_key] if RUN_KEY in result: parsed[RUN_KEY] = result[RUN_KEY] + if "score" in parsed and self.normalize_by is not None: + parsed["score"] = parsed["score"] / self.normalize_by return parsed def _evaluate_strings( @@ -392,6 +420,7 @@ class LabeledScoreStringEvalChain(ScoreStringEvalChain): *, prompt: Optional[PromptTemplate] = None, criteria: Optional[Union[CRITERIA_TYPE, str]] = None, + normalize_by: Optional[float] = None, **kwargs: Any, ) -> LabeledScoreStringEvalChain: """Initialize the LabeledScoreStringEvalChain from an LLM. @@ -400,6 +429,7 @@ class LabeledScoreStringEvalChain(ScoreStringEvalChain): llm (BaseLanguageModel): The LLM to use. prompt (PromptTemplate, optional): The prompt to use. criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use. + normalize_by (float, optional): The value to normalize the score by. **kwargs (Any): Additional keyword arguments. Returns: @@ -422,6 +452,16 @@ class LabeledScoreStringEvalChain(ScoreStringEvalChain): f"but got {prompt_.input_variables}" ) criteria_ = resolve_criteria(criteria) - criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items()) - criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else "" - return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs) + criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items()).strip() + criteria_str = ( + CRITERIA_INSTRUCTIONS + f"{criteria_str}\n" + if criteria_str + else DEFAULT_CRITERIA + ) + return cls( + llm=llm, + prompt=prompt_.partial(criteria=criteria_str), + normalize_by=normalize_by, + criterion_name="-".join(criteria_), + **kwargs, + ) diff --git a/libs/langchain/langchain/evaluation/scoring/prompt.py b/libs/langchain/langchain/evaluation/scoring/prompt.py index 10a6536254..1d25055834 100644 --- a/libs/langchain/langchain/evaluation/scoring/prompt.py +++ b/libs/langchain/langchain/evaluation/scoring/prompt.py @@ -39,9 +39,10 @@ SCORING_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages( ("system", SYSTEM_MESSAGE), ( "human", - '[Instruction]\nPlease act as an impartial judge \ + "[Instruction]\nPlease act as an impartial judge \ and evaluate the quality of the response provided by an AI \ -assistant to the user question displayed below. {criteria}{reference}Begin your evaluation \ +assistant to the user question displayed below. {criteria}" + '[Ground truth]\n{reference}\nBegin your evaluation \ by providing a short explanation. Be as objective as possible. 
\
 After providing your explanation, you must rate the response on a scale of 1 to 10 \
 by strictly following this format: "[[rating]]", for example: "Rating: [[5]]".\n\n\
diff --git a/libs/langchain/langchain/smith/evaluation/config.py b/libs/langchain/langchain/smith/evaluation/config.py
index 7674c6734f..cdbcc15de9 100644
--- a/libs/langchain/langchain/smith/evaluation/config.py
+++ b/libs/langchain/langchain/smith/evaluation/config.py
@@ -291,4 +291,40 @@ class RunEvalConfig(BaseModel):
         evaluator_type: EvaluatorType = EvaluatorType.REGEX_MATCH
         flags: int = 0
 
-    # TODO: Trajectory
+    class ScoreString(EvalConfig):
+        """Configuration for a score string evaluator.
+        This is like the criteria evaluator but it is configured by
+        default to return a score on a scale from 1 to 10.
+
+        It is recommended to normalize these scores
+        by setting `normalize_by` to 10.
+
+        Parameters
+        ----------
+        criteria : Optional[CRITERIA_TYPE]
+            The criteria to evaluate.
+        llm : Optional[BaseLanguageModel]
+            The language model to use for the evaluation chain.
+        normalize_by : Optional[float]
+            If you want to normalize the score, the denominator to use.
+            If not provided, the score will be between 1 and 10 (by default).
+        prompt : Optional[BasePromptTemplate]
+
+        """
+
+        evaluator_type: EvaluatorType = EvaluatorType.SCORE_STRING
+        criteria: Optional[CRITERIA_TYPE] = None
+        llm: Optional[BaseLanguageModel] = None
+        normalize_by: Optional[float] = None
+        prompt: Optional[BasePromptTemplate] = None
+
+        def __init__(
+            self,
+            criteria: Optional[CRITERIA_TYPE] = None,
+            normalize_by: Optional[float] = None,
+            **kwargs: Any
+        ) -> None:
+            super().__init__(criteria=criteria, normalize_by=normalize_by, **kwargs)
+
+    class LabeledScoreString(ScoreString):
+        evaluator_type: EvaluatorType = EvaluatorType.LABELED_SCORE_STRING
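
Usage note (not part of the patch): the sketch below shows how the `normalize_by` option introduced here is meant to be consumed, both directly through `LabeledScoreStringEvalChain.from_llm` and through the `RunEvalConfig.LabeledScoreString` config used in the notebook above. It is a minimal, illustrative sketch; the model name and the wording of the `accuracy` criterion are assumptions chosen for the example, not values taken from the patch.

```python
# Illustrative sketch of the new `normalize_by` option.
# The model name and criterion wording are assumptions, not part of the patch.
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import LabeledScoreStringEvalChain
from langchain.smith import RunEvalConfig

llm = ChatOpenAI(model="gpt-4", temperature=0)

# Direct construction: the raw 1-10 rating parsed from the LLM response is divided
# by `normalize_by`, so a "Rating: [[8]]" comes back as a score of 0.8.
evaluator = LabeledScoreStringEvalChain.from_llm(
    llm=llm,
    criteria={"accuracy": "How factually consistent is the answer with the reference?"},
    normalize_by=10,
)
result = evaluator.evaluate_strings(
    input="What is the capital of France?",
    prediction="The capital of France is Paris.",
    reference="Paris is the capital of France.",
)
print(result["score"])  # e.g. 1.0 rather than 10

# Feedback is reported under "score_string:<criterion>" (e.g. "score_string:accuracy")
# via the new `evaluation_name` property, matching the column shown in the notebook output.

# Equivalent configuration when evaluating a LangSmith dataset run:
eval_config = RunEvalConfig(
    evaluators=[
        RunEvalConfig.LabeledScoreString(
            criteria={"accuracy": "How factually consistent is the answer with the reference?"},
            normalize_by=10,
        ),
    ],
)
```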