Add Feedback Methods + Evaluation examples (#5166)

Add CRUD methods for interacting with the feedback endpoints, and add evaluation examples to the notebook
12 months ago · ea09c0846f
parent 46b7181f13
commit ea09c0846f
7 changed files with 453 additions and 547 deletions
--- a/langchain/client/langchain.py
+++ b/langchain/client/langchain.py
@ -11,7 +11,9 @@ from typing import (
    Dict,
    Iterator,
    List,
+    Mapping,
    Optional,
+    Sequence,
    Tuple,
    Union,
 )
@ -27,11 +29,19 @@ from langchain.base_language import BaseLanguageModel
 from langchain.callbacks.tracers.schemas import Run, TracerSession
 from langchain.chains.base import Chain
 from langchain.client.models import (
+    APIFeedbackSource,
    Dataset,
    DatasetCreate,
    Example,
    ExampleCreate,
+    ExampleUpdate,
+    Feedback,
+    FeedbackCreate,
+    FeedbackSourceBase,
+    FeedbackSourceType,
+    ListFeedbackQueryParams,
    ListRunsQueryParams,
+    ModelFeedbackSource,
 )
 from langchain.client.runner_utils import arun_on_examples, run_on_examples
 from langchain.utils import raise_for_status_with_text, xor_args
@ -158,8 +168,8 @@ class LangChainPlusClient(BaseSettings):
        df: pd.DataFrame,
        name: str,
        description: str,
-        input_keys: List[str],
-        output_keys: List[str],
+        input_keys: Sequence[str],
+        output_keys: Sequence[str],
    ) -> Dataset:
        """Upload a dataframe as individual examples to the LangChain+ API."""
        dataset = self.create_dataset(dataset_name=name, description=description)
@ -173,8 +183,8 @@ class LangChainPlusClient(BaseSettings):
        self,
        csv_file: Union[str, Tuple[str, BytesIO]],
        description: str,
-        input_keys: List[str],
-        output_keys: List[str],
+        input_keys: Sequence[str],
+        output_keys: Sequence[str],
    ) -> Dataset:
        """Upload a CSV file to the LangChain+ API."""
        files = {"file": csv_file}
@ -223,10 +233,7 @@ class LangChainPlusClient(BaseSettings):
        query_params = ListRunsQueryParams(
            session_id=session_id, run_type=run_type, **kwargs
        )
-        filtered_params = {
-            k: v for k, v in query_params.dict().items() if v is not None
-        }
-        response = self._get("/runs", params=filtered_params)
+        response = self._get("/runs", params=query_params.dict(exclude_none=True))
        raise_for_status_with_text(response)
        yield from [Run(**run) for run in response.json()]

@ -279,7 +286,9 @@ class LangChainPlusClient(BaseSettings):
        raise_for_status_with_text(response)
        return None

-    def create_dataset(self, dataset_name: str, description: str) -> Dataset:
+    def create_dataset(
+        self, dataset_name: str, *, description: Optional[str] = None
+    ) -> Dataset:
        """Create a dataset in the LangChain+ API."""
        dataset = DatasetCreate(
            tenant_id=self.tenant_id,
@ -394,6 +403,110 @@ class LangChainPlusClient(BaseSettings):
        raise_for_status_with_text(response)
        yield from [Example(**dataset) for dataset in response.json()]

+    def update_example(
+        self,
+        example_id: str,
+        *,
+        inputs: Optional[Dict[str, Any]] = None,
+        outputs: Optional[Mapping[str, Any]] = None,
+        dataset_id: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """Patch an existing example with whichever fields are supplied.
+
+        Args:
+            example_id: ID of the example to update.
+            inputs: Replacement inputs, if any.
+            outputs: Replacement outputs, if any.
+            dataset_id: Dataset ID to set on the example, if any.
+
+        Returns:
+            The decoded JSON body of the API response.
+        """
+        # Fields left as None are dropped from the serialized payload.
+        payload = ExampleUpdate(
+            inputs=inputs,
+            outputs=outputs,
+            dataset_id=dataset_id,
+        ).json(exclude_none=True)
+        url = f"{self.api_url}/examples/{example_id}"
+        result = requests.patch(url, headers=self._headers, data=payload)
+        raise_for_status_with_text(result)
+        return result.json()
+
+    def create_feedback(
+        self,
+        run_id: str,
+        key: str,
+        *,
+        score: Union[float, int, bool, None] = None,
+        value: Union[float, int, bool, str, dict, None] = None,
+        correction: Union[str, dict, None] = None,
+        comment: Union[str, None] = None,
+        source_info: Optional[Dict[str, Any]] = None,
+        feedback_source_type: FeedbackSourceType = FeedbackSourceType.API,
+    ) -> Feedback:
+        """Record feedback for a run via the LangChain+ API.
+
+        Args:
+            run_id: The ID of the run to provide feedback on.
+            key: The name of the metric, tag, or 'aspect' this
+                feedback is about.
+            score: The score to rate this run on the metric
+                or aspect.
+            value: The display value or non-numeric value for this feedback.
+            correction: The proper ground truth for this run.
+            comment: A comment about this feedback.
+            source_info: Information about the source of this feedback.
+            feedback_source_type: The type of feedback source.
+
+        Returns:
+            A ``Feedback`` built from the payload that was submitted.
+
+        Raises:
+            ValueError: If ``feedback_source_type`` is not a known type.
+        """
+        if feedback_source_type == FeedbackSourceType.MODEL:
+            source: FeedbackSourceBase = ModelFeedbackSource(metadata=source_info)
+        elif feedback_source_type == FeedbackSourceType.API:
+            source = APIFeedbackSource(metadata=source_info)
+        else:
+            raise ValueError(f"Unknown feedback source type {feedback_source_type}")
+        payload = FeedbackCreate(
+            run_id=run_id,
+            key=key,
+            score=score,
+            value=value,
+            correction=correction,
+            comment=comment,
+            feedback_source=source,
+        )
+        endpoint = self.api_url + "/feedback"
+        resp = requests.post(endpoint, headers=self._headers, data=payload.json())
+        raise_for_status_with_text(resp)
+        # The returned object mirrors what was sent, not the server's reply.
+        return Feedback(**payload.dict())
+
+    @retry(stop=stop_after_attempt(3), wait=wait_fixed(0.5))
+    def read_feedback(self, feedback_id: str) -> Feedback:
+        """Fetch a single feedback record from the LangChain+ API.
+
+        The call is attempted up to three times, waiting 0.5s between tries.
+
+        Args:
+            feedback_id: ID of the feedback to read.
+
+        Returns:
+            The feedback parsed from the response body.
+        """
+        resp = self._get(f"/feedback/{feedback_id}")
+        raise_for_status_with_text(resp)
+        body = resp.json()
+        return Feedback(**body)
+
+    def list_feedback(
+        self,
+        *,
+        run_ids: Optional[Sequence[Union[str, UUID]]] = None,
+        **kwargs: Any,
+    ) -> Iterator[Feedback]:
+        """List the feedback objects on the LangChain+ API.
+
+        Args:
+            run_ids: Run IDs sent as the ``run`` query parameter.
+            **kwargs: Any other fields accepted by ``ListFeedbackQueryParams``.
+
+        Returns:
+            An iterator over the feedback objects in the response.
+        """
+        params = ListFeedbackQueryParams(
+            run=run_ids,
+            **kwargs,
+        )
+        # The request is issued in a plain (non-generator) helper: a retry
+        # decorator on a generator function only wraps creation of the
+        # generator object, so errors raised while iterating were never
+        # retried.
+        return iter(self._fetch_feedback(params))
+
+    @retry(stop=stop_after_attempt(3), wait=wait_fixed(0.5), reraise=True)
+    def _fetch_feedback(
+        self, params: ListFeedbackQueryParams
+    ) -> List[Feedback]:
+        """Fetch and parse one ``GET /feedback`` response, retrying on failure.
+
+        ``reraise=True`` surfaces the original exception (rather than a
+        ``RetryError``) once the attempts are exhausted.
+        """
+        response = self._get("/feedback", params=params.dict(exclude_none=True))
+        raise_for_status_with_text(response)
+        return [Feedback(**feedback) for feedback in response.json()]
+
+    def delete_feedback(self, feedback_id: str) -> None:
+        """Remove the feedback record with the given ID.
+
+        Args:
+            feedback_id: ID of the feedback to delete.
+        """
+        url = f"{self.api_url}/feedback/{feedback_id}"
+        resp = requests.delete(url, headers=self._headers)
+        raise_for_status_with_text(resp)
+
    async def arun_on_dataset(
        self,
        dataset_name: str,
--- a/langchain/client/models.py
+++ b/langchain/client/models.py
@ -1,6 +1,7 @@
 from datetime import datetime
-from typing import Any, Dict, List, Optional
-from uuid import UUID
+from enum import Enum
+from typing import Any, ClassVar, Dict, List, Mapping, Optional, Sequence, Union
+from uuid import UUID, uuid4

 from pydantic import BaseModel, Field, root_validator

@ -14,6 +15,9 @@ class ExampleBase(BaseModel):
    inputs: Dict[str, Any]
    outputs: Optional[Dict[str, Any]] = Field(default=None)

+    class Config:
+        frozen = True
+

 class ExampleCreate(ExampleBase):
    """Example create model."""
@ -31,12 +35,26 @@ class Example(ExampleBase):
    runs: List[Run] = Field(default_factory=list)


+class ExampleUpdate(BaseModel):
+    """Partial-update payload for an existing Example.
+
+    Every field is optional and defaults to ``None``.
+    """
+
+    # Dataset ID to set on the example.
+    dataset_id: Optional[UUID] = None
+    # Replacement inputs for the example.
+    inputs: Optional[Dict[str, Any]] = None
+    # Replacement outputs for the example.
+    outputs: Optional[Dict[str, Any]] = None
+
+    class Config:
+        # Instances are immutable once constructed.
+        frozen = True
+
+
 class DatasetBase(BaseModel):
    """Dataset base model."""

    tenant_id: UUID
    name: str
-    description: str
+    description: Optional[str] = None
+
+    class Config:
+        frozen = True


 class DatasetCreate(DatasetBase):
@ -57,9 +75,6 @@ class Dataset(DatasetBase):
 class ListRunsQueryParams(BaseModel):
    """Query params for GET /runs endpoint."""

-    class Config:
-        extra = "forbid"
-
    id: Optional[List[UUID]]
    """Filter runs by id."""
    parent_run: Optional[UUID]
@ -89,7 +104,11 @@ class ListRunsQueryParams(BaseModel):
        description="Query Runs that ended >= this time",
    )

-    @root_validator
+    class Config:
+        extra = "forbid"
+        frozen = True
+
+    @root_validator(allow_reuse=True)
    def validate_time_range(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Validate that start_time <= end_time."""
        start_time = values.get("start_time")
@ -97,3 +116,91 @@ class ListRunsQueryParams(BaseModel):
        if start_time and end_time and start_time > end_time:
            raise ValueError("start_time must be <= end_time")
        return values
+
+
+class FeedbackSourceBase(BaseModel):
+    """Common base for feedback source descriptors."""
+
+    # Discriminator set by each subclass.
+    # NOTE(review): as a ClassVar this is not a pydantic field, so it is not
+    # included in .dict()/.json() output -- confirm the API does not need it.
+    type: ClassVar[str]
+    # Free-form information about the source.
+    metadata: Optional[Dict[str, Any]] = None
+
+    class Config:
+        # Instances are immutable once constructed.
+        frozen = True
+
+
+class APIFeedbackSource(FeedbackSourceBase):
+    """API feedback source."""
+
+    # Same string as FeedbackSourceType.API's value.
+    type: ClassVar[str] = "api"
+
+
+class ModelFeedbackSource(FeedbackSourceBase):
+    """Model feedback source."""
+
+    # Same string as FeedbackSourceType.MODEL's value.
+    type: ClassVar[str] = "model"
+
+
+class FeedbackSourceType(Enum):
+    """Enumeration of the supported feedback source types."""
+
+    API = "api"
+    """General feedback submitted from the API."""
+    MODEL = "model"
+    """Model-assisted feedback."""
+
+
+class FeedbackBase(BaseModel):
+    """Feedback schema.
+
+    Fields shared by the create and read feedback models.
+    """
+
+    # Both timestamps default to datetime.utcnow(), i.e. naive datetimes
+    # (no tzinfo) taken when the model is instantiated.
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+    """The time the feedback was created."""
+    modified_at: datetime = Field(default_factory=datetime.utcnow)
+    """The time the feedback was last modified."""
+    run_id: UUID
+    """The associated run ID this feedback is logged for."""
+    key: str
+    """The metric name, tag, or aspect to provide feedback on."""
+    score: Union[float, int, bool, None] = None
+    """Value or score to assign the run."""
+    value: Union[float, int, bool, str, dict, None] = None
+    """The display value, tag or other value for the feedback if not a metric."""
+    comment: Optional[str] = None
+    """Comment or explanation for the feedback."""
+    correction: Union[str, dict, None] = None
+    """Correction for the run."""
+    # Accepts either a typed source model or a plain mapping; subclasses
+    # narrow this annotation.
+    feedback_source: Optional[
+        Union[APIFeedbackSource, ModelFeedbackSource, Mapping[str, Any]]
+    ] = None
+    """The source of the feedback."""
+
+    class Config:
+        # Instances are immutable once constructed.
+        frozen = True
+
+
+class FeedbackCreate(FeedbackBase):
+    """Schema used for creating feedback."""
+
+    # A fresh random UUID is generated when no ID is supplied.
+    id: UUID = Field(default_factory=uuid4)
+
+    # Typed as the base class so any source subclass is kept as-is; with
+    # APIFeedbackSource here, pydantic re-built a ModelFeedbackSource as an
+    # APIFeedbackSource during validation.
+    feedback_source: FeedbackSourceBase
+    """The source of the feedback."""
+
+
+class Feedback(FeedbackBase):
+    """Schema for getting feedback."""
+
+    id: UUID
+    feedback_source: Optional[Dict] = None
+    """The source of the feedback. In this case it is a plain dictionary."""
+
+
+class ListFeedbackQueryParams(BaseModel):
+    """Query Params for listing feedbacks."""
+
+    # Run IDs to query feedback for; omitted from the request when None.
+    run: Optional[Sequence[UUID]] = None
+    # Presumably page size and starting position for pagination -- confirm
+    # against the API.
+    limit: int = 100
+    offset: int = 0
+
+    class Config:
+        """Config for query params."""
+
+        # Unknown keyword arguments raise a validation error.
+        extra = "forbid"
+        # Instances are immutable once constructed.
+        frozen = True
--- a/langchain/client/runner_utils.py
+++ b/langchain/client/runner_utils.py
@ -151,7 +151,7 @@ async def _arun_llm_or_chain(
                )
            else:
                chain = llm_or_chain_factory()
-                output = await chain.arun(example.inputs, callbacks=callbacks)
+                output = await chain.acall(example.inputs, callbacks=callbacks)
            outputs.append(output)
        except Exception as e:
            logger.warning(f"Chain failed for example {example.id}. Error: {e}")
@ -326,7 +326,7 @@ def run_llm_or_chain(
                output: Any = run_llm(llm_or_chain_factory, example.inputs, callbacks)
            else:
                chain = llm_or_chain_factory()
-                output = chain.run(example.inputs, callbacks=callbacks)
+                output = chain(example.inputs, callbacks=callbacks)
            outputs.append(output)
        except Exception as e:
            logger.warning(f"Chain failed for example {example.id}. Error: {e}")
--- a/langchain/client/utils.py
+++ b/langchain/client/utils.py
--- a/langchain/evaluation/qa/eval_chain.py
+++ b/langchain/evaluation/qa/eval_chain.py
@ -1,7 +1,7 @@
 """LLM Chain specifically for evaluating question answering."""
 from __future__ import annotations

-from typing import Any, List
+from typing import Any, List, Sequence

 from langchain import PromptTemplate
 from langchain.base_language import BaseLanguageModel
@ -41,8 +41,8 @@ class QAEvalChain(LLMChain):

    def evaluate(
        self,
-        examples: List[dict],
-        predictions: List[dict],
+        examples: Sequence[dict],
+        predictions: Sequence[dict],
        question_key: str = "query",
        answer_key: str = "answer",
        prediction_key: str = "result",
--- a/langchain/experimental/client/tracing_datasets.ipynb
+++ b/langchain/experimental/client/tracing_datasets.ipynb
@ -86,10 +86,10 @@
    {
     "data": {
      "text/html": [
-       "<a href=\"http://localhost\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
+       "<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
      ],
      "text/plain": [
-       "LangChainPlusClient (API URL: http://localhost:8000)"
+       "LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
      ]
     },
     "execution_count": 1,
@ -101,7 +101,6 @@
    "import os\n",
    "from langchain.client import LangChainPlusClient\n",
    "\n",
-    "import os\n",
    "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
    "os.environ[\"LANGCHAIN_SESSION\"] = \"Tracing Walkthrough\"\n",
    "# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\"  # Uncomment this line if you want to use the hosted version\n",
@ -142,60 +141,59 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "39,566,248\n",
-      "Anwar Hadid is Dua Lipa's boyfriend and his age raised to the 0.43 power is approximately 3.87.\n",
-      "LLMMathChain._evaluate(\"\n",
-      "(age ** 0.43)\n",
-      "\") raised error: 'age'. Please try again with a valid numerical expression\n",
-      "The distance between Paris and Boston is 3448 miles.\n",
-      "The total number of points scored in the 2023 super bowl raised to the .23 power is approximately 3.457460415669602.\n",
-      "LLMMathChain._evaluate(\"\n",
-      "(total number of points scored in the 2023 super bowl)**0.23\n",
-      "\") raised error: invalid syntax. Perhaps you forgot a comma? (<expr>, line 1). Please try again with a valid numerical expression\n"
+      "unknown format from LLM: Sorry, I cannot answer this question as it requires information that is not currently available.\n",
+      "unknown format from LLM: Sorry, as an AI language model, I do not have access to personal information such as someone's age. Please provide a different math problem.\n",
+      "unknown format from LLM: As an AI language model, I do not have information on future events such as the 2023 super bowl. Therefore, I cannot provide a solution to this question.\n",
+      "unknown format from LLM: This is not a math problem and cannot be translated into a mathematical expression.\n"
     ]
    },
    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 63c89b8bad9b172227d890620cdec651 in your message.).\n",
-      "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID e3dd37877de500d7defe699f8411b3dd in your message.).\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0\n",
-      "1.9347796717823205\n",
-      "1.2600907451828602 (inches)\n",
-      "LLMMathChain._evaluate(\"\n",
-      "round(0.2791714614499425, 2)\n",
-      "\") raised error: 'VariableNode' object is not callable. Please try again with a valid numerical expression\n"
-     ]
+     "data": {
+      "text/plain": [
+       "['The population of Canada as of 2023 is estimated to be 39,566,248.',\n",
+       " \"Anwar Hadid's age raised to the 0.43 power is approximately 3.87.\",\n",
+       " ValueError(\"unknown format from LLM: Sorry, as an AI language model, I do not have access to personal information such as someone's age. Please provide a different math problem.\"),\n",
+       " 'The distance between Paris and Boston is 3448 miles.',\n",
+       " ValueError('unknown format from LLM: Sorry, I cannot answer this question as it requires information that is not currently available.'),\n",
+       " ValueError('unknown format from LLM: As an AI language model, I do not have information on future events such as the 2023 super bowl. Therefore, I cannot provide a solution to this question.'),\n",
+       " '15 points were scored more in the 2023 Super Bowl than in the 2022 Super Bowl.',\n",
+       " '1.9347796717823205',\n",
+       " ValueError('unknown format from LLM: This is not a math problem and cannot be translated into a mathematical expression.'),\n",
+       " '0.2791714614499425']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
    }
   ],
   "source": [
+    "import asyncio\n",
+    "\n",
    "inputs = [\n",
-    "'How many people live in canada as of 2023?',\n",
-    " \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n",
-    " \"what is dua lipa's boyfriend age raised to the .43 power?\",\n",
-    " 'how far is it from paris to boston in miles',\n",
-    " 'what was the total number of points scored in the 2023 super bowl? what is that number raised to the .23 power?',\n",
-    " 'what was the total number of points scored in the 2023 super bowl raised to the .23 power?',\n",
-    " 'how many more points were scored in the 2023 super bowl than in the 2022 super bowl?',\n",
-    " 'what is 153 raised to .1312 power?',\n",
-    " \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\",\n",
-    " 'what is 1213 divided by 4345?'\n",
+    "    \"How many people live in canada as of 2023?\",\n",
+    "    \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n",
+    "    \"what is dua lipa's boyfriend age raised to the .43 power?\",\n",
+    "    \"how far is it from paris to boston in miles\",\n",
+    "    \"what was the total number of points scored in the 2023 super bowl? what is that number raised to the .23 power?\",\n",
+    "    \"what was the total number of points scored in the 2023 super bowl raised to the .23 power?\",\n",
+    "    \"how many more points were scored in the 2023 super bowl than in the 2022 super bowl?\",\n",
+    "    \"what is 153 raised to .1312 power?\",\n",
+    "    \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\",\n",
+    "    \"what is 1213 divided by 4345?\",\n",
    "]\n",
+    "results = []\n",
    "\n",
-    "for input_example in inputs:\n",
+    "async def arun(agent, input_example):\n",
    "    try:\n",
-    "        print(agent.run(input_example))\n",
+    "        return await agent.arun(input_example)\n",
    "    except Exception as e:\n",
    "        # The agent sometimes makes mistakes! These will be captured by the tracing.\n",
    "        print(e)\n",
-    "           "
+    "        return e\n",
+    "for input_example in inputs:\n",
+    "    results.append(arun(agent, input_example))\n",
+    "await asyncio.gather(*results)     "
   ]
  },
  {
@ -217,42 +215,31 @@
   },
   "outputs": [],
   "source": [
-    "dataset_name = \"calculator-example-dataset\""
+    "dataset_name = \"calculator-example-dataset-2\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
-   "id": "c0e12629-bca5-4438-8665-890d0cb9cc4a",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "runs = client.list_runs(\n",
-    "        session_name=os.environ[\"LANGCHAIN_SESSION\"],\n",
-    "        run_type=\"chain\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
   "id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
-    "if dataset_name not in set([dataset.name for dataset in client.list_datasets()]):\n",
-    "    dataset = client.create_dataset(dataset_name, description=\"A calculator example dataset\")\n",
-    "    # List all \"Chain\" runs in the current session \n",
-    "    runs = client.list_runs(\n",
-    "        session_name=os.environ[\"LANGCHAIN_SESSION\"],\n",
-    "        run_type=\"chain\")\n",
-    "    for run in runs:\n",
-    "        if run.name == \"AgentExecutor\":\n",
-    "            # We will only use examples from the top level AgentExecutor run here.\n",
-    "            client.create_example(inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id)"
+    "if dataset_name in set([dataset.name for dataset in client.list_datasets()]):\n",
+    "    client.delete_dataset(dataset_name=dataset_name)\n",
+    "dataset = client.create_dataset(dataset_name, description=\"A calculator example dataset\")\n",
+    "runs = client.list_runs(\n",
+    "    session_name=os.environ[\"LANGCHAIN_SESSION\"],\n",
+    "    execution_order=1, # Only return the top-level runs\n",
+    "    error=False, # Only runs that succeed\n",
+    ")\n",
+    "for run in runs:\n",
+    "    try:\n",
+    "        client.create_example(inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id)\n",
+    "    except:\n",
+    "        pass"
   ]
  },
  {
@ -286,7 +273,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
   "id": "1baa677c-5642-4378-8e01-3aa1647f19d6",
   "metadata": {
    "tags": []
@ -299,7 +286,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
   "id": "60d14593-c61f-449f-a38f-772ca43707c2",
   "metadata": {
    "tags": []
@ -317,7 +304,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
   "id": "52a7ea76-79ca-4765-abf7-231e884040d6",
   "metadata": {
    "tags": []
@ -353,7 +340,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
   "id": "c2b59104-b90e-466a-b7ea-c5bd0194263b",
   "metadata": {
    "tags": []
@ -381,7 +368,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
   "id": "112d7bdf-7e50-4c1a-9285-5bac8473f2ee",
   "metadata": {
    "tags": []
@ -418,7 +405,7 @@
       "\n",
       "Returns:\n",
       "    A dictionary mapping example ids to the model outputs.\n",
-       "\u001b[0;31mFile:\u001b[0m      ~/Code/langchain/langchain/client/langchain.py\n",
+       "\u001b[0;31mFile:\u001b[0m      ~/code/lc/lckg/langchain/client/langchain.py\n",
       "\u001b[0;31mType:\u001b[0m      method"
      ]
     },
@ -432,7 +419,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
   "id": "6e10f823",
   "metadata": {
    "tags": []
@ -442,7 +429,12 @@
    "# Since chains can be stateful (e.g. they can have memory), we need provide\n",
    "# a way to initialize a new chain for each row in the dataset. This is done\n",
    "# by passing in a factory function that returns a new chain for each row.\n",
-    "chain_factory = lambda: initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False)\n",
+    "chain_factory = lambda: initialize_agent(\n",
+    "    tools,\n",
+    "    llm,\n",
+    "    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n",
+    "    verbose=False,\n",
+    ")\n",
    "\n",
    "# If your chain is NOT stateful, your lambda can return the object directly\n",
    "# to improve runtime performance. For example:\n",
@ -451,7 +443,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
   "id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33",
   "metadata": {
    "tags": []
@ -461,314 +453,85 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Processed examples: 1\r"
+      "Processed examples: 4\r"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "Chain failed for example 604fbd32-7cbe-4dd4-9ddd-fd5ab5c01566. Error: LLMMathChain._evaluate(\"\n",
-      "(age ** 0.43)\n",
-      "\") raised error: 'age'. Please try again with a valid numerical expression\n"
+      "Chain failed for example 898af6aa-ea39-4959-9ecd-9b9f1ffee31c. Error: LLMMathChain._evaluate(\"\n",
+      "round(0.2791714614499425, 2)\n",
+      "\") raised error: 'VariableNode' object is not callable. Please try again with a valid numerical expression\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Processed examples: 4\r"
+      "Processed examples: 5\r"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "Chain failed for example 4c82b6a4-d8ce-4129-8229-7f4e2f76294c. Error: LLMMathChain._evaluate(\"\n",
-      "(total number of points scored in the 2023 super bowl)**0.23\n",
-      "\") raised error: invalid syntax. Perhaps you forgot a comma? (<expr>, line 1). Please try again with a valid numerical expression\n"
+      "Chain failed for example ffb8071d-60e4-49ca-aa9f-5ec03ea78f2d. Error: unknown format from LLM: This is not a math problem and cannot be translated into a mathematical expression.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Processed examples: 10\r"
+      "Processed examples: 6\r"
     ]
-    }
-   ],
-   "source": [
-    "chain_results = await client.arun_on_dataset(\n",
-    "    dataset_name=dataset_name,\n",
-    "    llm_or_chain_factory=chain_factory,\n",
-    "    concurrency_level=5, # Optional, sets the number of examples to run at a time\n",
-    "    verbose=True\n",
-    ")\n",
-    "\n",
-    "# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n",
-    "# These are logged as warnings here and captured as errors in the tracing UI."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d2737458-b20c-4288-8790-1f4a8d237b2a",
-   "metadata": {},
-   "source": [
-    "## Reviewing the Chain Results\n",
-    "\n",
-    "You can review the results of the run in the tracing UI below and navigating to the session \n",
-    "with the title 'calculator-example-dataset-AgentExecutor-YYYY-MM-DD-HH-MM-SS'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "136db492-d6ca-4215-96f9-439c23538241",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<a href=\"http://localhost\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
-      ],
-      "text/plain": [
-       "LangChainPlusClient (API URL: http://localhost:8000)"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# You can navigate to the UI by clicking on the link below\n",
-    "client"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c70cceb5-aa53-4851-bb12-386f092191f9",
-   "metadata": {},
-   "source": [
-    "### Running a Chat Model over a Traced Dataset\n",
-    "\n",
-    "We've shown how to run a _chain_ over a dataset, but you can also run an LLM or Chat model over a datasets formed from runs. \n",
-    "\n",
-    "First, we'll show an example using a ChatModel. This is useful for things like:\n",
-    "- Comparing results under different decoding parameters\n",
-    "- Comparing model providers\n",
-    "- Testing for regressions in model behavior\n",
-    "- Running multiple times with a temperature to gauge stability \n",
-    "\n",
-    "To speed things up, we'll upload a dataset we've previously captured directly to the tracing service."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "64490d7c-9a18-49ed-a3ac-36049c522cb4",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
+    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "Found cached dataset parquet (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___parquet/LangChainDatasets--two-player-dnd-cc62c3037e2d9250/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n"
+      "Retrying langchain.chat_models.openai.acompletion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 29fc448d09a0f240719eb1dbb95db18d in your message.).\n"
     ]
    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "44f3c72015944e2ea4c39516350ea15c",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>generations</th>\n",
-       "      <th>messages</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>[[{'generation_info': None, 'message': {'conte...</td>\n",
-       "      <td>[{'data': {'content': 'Here is the topic for a...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>[[{'generation_info': None, 'message': {'conte...</td>\n",
-       "      <td>[{'data': {'content': 'Here is the topic for a...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>[[{'generation_info': None, 'message': {'conte...</td>\n",
-       "      <td>[{'data': {'content': 'Here is the topic for a...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>[[{'generation_info': None, 'message': {'conte...</td>\n",
-       "      <td>[{'data': {'content': 'Here is the topic for a...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>[[{'generation_info': None, 'message': {'conte...</td>\n",
-       "      <td>[{'data': {'content': 'Here is the topic for a...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                         generations  \\\n",
-       "0  [[{'generation_info': None, 'message': {'conte...   \n",
-       "1  [[{'generation_info': None, 'message': {'conte...   \n",
-       "2  [[{'generation_info': None, 'message': {'conte...   \n",
-       "3  [[{'generation_info': None, 'message': {'conte...   \n",
-       "4  [[{'generation_info': None, 'message': {'conte...   \n",
-       "\n",
-       "                                            messages  \n",
-       "0  [{'data': {'content': 'Here is the topic for a...  \n",
-       "1  [{'data': {'content': 'Here is the topic for a...  \n",
-       "2  [{'data': {'content': 'Here is the topic for a...  \n",
-       "3  [{'data': {'content': 'Here is the topic for a...  \n",
-       "4  [{'data': {'content': 'Here is the topic for a...  "
-      ]
-     },
-     "execution_count": 16,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import pandas as pd\n",
-    "from langchain.evaluation.loading import load_dataset\n",
-    "\n",
-    "chat_dataset = load_dataset(\"two-player-dnd\")\n",
-    "chat_df = pd.DataFrame(chat_dataset)\n",
-    "chat_df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "348acd86-a927-4d60-8d52-02e64585e4fc",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "chat_dataset_name = \"two-player-dnd\"\n",
-    "\n",
-    "if chat_dataset_name not in set([dataset.name for dataset in client.list_datasets()]):\n",
-    "    client.upload_dataframe(chat_df, \n",
-    "                            name=chat_dataset_name,\n",
-    "                            description=\"An example dataset traced from chat models in a multiagent bidding dialogue\",\n",
-    "                            input_keys=[\"messages\"],\n",
-    "                            output_keys=[\"generations\"],\n",
-    "                   )"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "927a43b8-e4f9-4220-b75d-33e310bc318b",
-   "metadata": {},
-   "source": [
-    "#### Reviewing behavior with temperature\n",
-    "\n",
-    "Here, we will set `num_repetitions > 1` and set the temperature to 0.3 to see the variety of response types for a each example.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "id": "a69dd183-ad5e-473d-b631-db90706e837f",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.chat_models import ChatAnthropic\n",
-    "\n",
-    "chat_model = ChatAnthropic(temperature=.3)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "id": "063da2a9-3692-4b7b-8edb-e474824fe416",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Processed examples: 36\r"
+      "Processed examples: 7\r"
     ]
    }
   ],
   "source": [
-    "chat_model_results = await client.arun_on_dataset(\n",
-    "    dataset_name=chat_dataset_name,\n",
-    "    llm_or_chain_factory=chat_model,\n",
+    "evaluation_session_name = \"Search + Calculator Agent Evaluation\"\n",
+    "chain_results = await client.arun_on_dataset(\n",
+    "    dataset_name=dataset_name,\n",
+    "    llm_or_chain_factory=chain_factory,\n",
    "    concurrency_level=5, # Optional, sets the number of examples to run at a time\n",
-    "    num_repetitions=3,\n",
-    "    verbose=True\n",
+    "    verbose=True,\n",
+    "    session_name=evaluation_session_name # Optional, a unique session name will be generated if not provided\n",
    ")\n",
    "\n",
-    "# The 'experimental tracing v2' warning is expected, as we are still actively developing the v2 tracing API \n",
-    "# Since we are running examples concurrently,  you may run into some RateLimit warnings from your model\n",
-    "# provider. In most cases, the tests will still run to completion (the wrappers have backoff)."
+    "# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n",
+    "# These are logged as warnings here and captured as errors in the tracing UI."
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "de7bfe08-215c-4328-b9b0-631d9a41f0e8",
+   "id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4",
   "metadata": {
    "tags": []
   },
   "source": [
-    "## Reviewing the Chat Model Results\n",
+    "### Reviewing the Chain Results\n",
    "\n",
-    "You can review the latest runs by clicking on the link below and navigating to the \"two-player-dnd\" session."
+    "You can review the results of the run in the tracing UI by clicking the link below and navigating to the session \n",
+    "with the title **\"Search + Calculator Agent Evaluation\"**"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
-   "id": "5b7a81f2-d19d-438b-a4bb-5678f746b965",
+   "execution_count": 13,
+   "id": "136db492-d6ca-4215-96f9-439c23538241",
   "metadata": {
    "tags": []
   },
@ -776,229 +539,88 @@
    {
     "data": {
      "text/html": [
-       "<a href=\"http://localhost\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
+       "<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
      ],
      "text/plain": [
-       "LangChainPlusClient (API URL: http://localhost:8000)"
+       "LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
      ]
     },
-     "execution_count": 20,
+     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
+    "# You can navigate to the UI by clicking on the link below\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "7896cbeb-345f-430b-ab5e-e108973174f8",
+   "id": "63ed6561-6574-43b3-a653-fe410aa8a617",
   "metadata": {},
   "source": [
-    "## Running an LLM over a Traced Dataset\n",
+    "## Running an Evaluation Chain\n",
    "\n",
-    "You can run an LLM over a dataset in much the same way as the chain and chat models, provided the dataset you've captured is in the appropriate format. We've cached one for you here, but using application-specific traces will be much more useful for your use cases."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "d6805d0b-4612-4671-bffb-e6978992bd40",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.llms import OpenAI\n",
+    "Manually comparing the results of chains in the UI is effective, but it can be time-consuming.\n",
+    "It's easier to leverage AI-assisted feedback to evaluate your agent's performance.\n",
    "\n",
-    "llm = OpenAI(model_name='text-curie-001', temperature=0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "id": "5d7cb243-40c3-44dd-8158-a7b910441e9f",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Found cached dataset parquet (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___parquet/LangChainDatasets--state-of-the-union-completions-5347290a406c64c8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5ce2168f975241fbae82a76b4d70e4c4",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "  0%|          | 0/1 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>generations</th>\n",
-       "      <th>ground_truth</th>\n",
-       "      <th>prompt</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>[[{'generation_info': {'finish_reason': 'stop'...</td>\n",
-       "      <td>The pandemic has been punishing. \\n\\nAnd so ma...</td>\n",
-       "      <td>Putin may circle Kyiv with tanks, but he will ...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>[[]]</td>\n",
-       "      <td>With a duty to one another to the American peo...</td>\n",
-       "      <td>Madam Speaker, Madam Vice President, our First...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>[[{'generation_info': {'finish_reason': 'stop'...</td>\n",
-       "      <td>He thought he could roll into Ukraine and the ...</td>\n",
-       "      <td>With a duty to one another to the American peo...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>[[]]</td>\n",
-       "      <td>And the costs and the threats to America and t...</td>\n",
-       "      <td>Please rise if you are able and show that, Yes...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>[[{'generation_info': {'finish_reason': 'stop'...</td>\n",
-       "      <td>Please rise if you are able and show that, Yes...</td>\n",
-       "      <td>Groups of citizens blocking tanks with their b...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                         generations  \\\n",
-       "0  [[{'generation_info': {'finish_reason': 'stop'...   \n",
-       "1                                               [[]]   \n",
-       "2  [[{'generation_info': {'finish_reason': 'stop'...   \n",
-       "3                                               [[]]   \n",
-       "4  [[{'generation_info': {'finish_reason': 'stop'...   \n",
-       "\n",
-       "                                        ground_truth  \\\n",
-       "0  The pandemic has been punishing. \\n\\nAnd so ma...   \n",
-       "1  With a duty to one another to the American peo...   \n",
-       "2  He thought he could roll into Ukraine and the ...   \n",
-       "3  And the costs and the threats to America and t...   \n",
-       "4  Please rise if you are able and show that, Yes...   \n",
-       "\n",
-       "                                              prompt  \n",
-       "0  Putin may circle Kyiv with tanks, but he will ...  \n",
-       "1  Madam Speaker, Madam Vice President, our First...  \n",
-       "2  With a duty to one another to the American peo...  \n",
-       "3  Please rise if you are able and show that, Yes...  \n",
-       "4  Groups of citizens blocking tanks with their b...  "
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "completions_dataset = load_dataset(\"state-of-the-union-completions\")\n",
-    "completions_df = pd.DataFrame(completions_dataset)\n",
-    "completions_df.head()"
+    "A few ways of doing this include:\n",
+    "- Adding ground-truth answers as outputs to the dataset and evaluating relative to those references.\n",
+    "- Evaluating the overall agent trajectory based on the tool usage and intermediate steps.\n",
+    "- Evaluating performance based on 'context' such as retrieved documents or tool results.\n",
+    "- Evaluating 'aspects' of the agent's response in a reference-free manner using targeted agent prompts.\n",
+    "    \n",
+    "Below, we show how to run an evaluation chain that compares the model output with the ground-truth answers.\n",
+    "\n",
+    "**Note: the feedback API is currently experimental and subject to change.**"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
-   "id": "c7dcc1b2-7aef-44c0-ba0f-c812279099a5",
+   "execution_count": 14,
+   "id": "35db4025-9183-4e5f-ba14-0b1b380f49c7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
-    "completions_dataset_name = \"state-of-the-union-completions\"\n",
+    "from langchain.evaluation.qa import QAEvalChain\n",
+    "\n",
+    "eval_llm = ChatOpenAI(model=\"gpt-4\")\n",
+    "chain = QAEvalChain.from_llm(eval_llm)\n",
+    "\n",
+    "examples = []\n",
+    "predictions = []\n",
+    "run_ids = []\n",
+    "for run in client.list_runs(session_name=evaluation_session_name, execution_order=1, error=False):\n",
+    "    if run.reference_example_id is None or not run.outputs:\n",
+    "        continue\n",
+    "    run_ids.append(run.id)\n",
+    "    example = client.read_example(run.reference_example_id)\n",
+    "    examples.append({**run.inputs, **example.outputs})\n",
+    "    predictions.append(\n",
+    "        run.outputs\n",
+    "    )\n",
+    "    \n",
+    "evaluation_results = chain.evaluate(\n",
+    "    examples,\n",
+    "    predictions,\n",
+    "    question_key=\"input\",\n",
+    "    answer_key=\"output\",\n",
+    "    prediction_key=\"output\"\n",
+    ")\n",
    "\n",
-    "if completions_dataset_name not in set([dataset.name for dataset in client.list_datasets()]):\n",
-    "    client.upload_dataframe(completions_df, \n",
-    "                            name=completions_dataset_name,\n",
-    "                            description=\"An example dataset traced from completion endpoints over the state of the union address\",\n",
-    "                            input_keys=[\"prompt\"],\n",
-    "                            output_keys=[\"generations\"],\n",
-    "                   )"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "e946138e-bf7c-43d7-861d-9c5740c933fa",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "50 processed\r"
-     ]
-    }
-   ],
-   "source": [
-    "# We also offer a synchronous method for running examples if a chain or llm's async methods aren't yet implemented\n",
-    "completions_model_results = client.run_on_dataset(\n",
-    "    dataset_name=completions_dataset_name,\n",
-    "    llm_or_chain_factory=llm,\n",
-    "    num_repetitions=1,\n",
-    "    verbose=True\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "cc86e8e6-cee2-429e-942b-289284d14816",
-   "metadata": {},
-   "source": [
-    "## Reviewing the LLM Results\n",
    "\n",
-    "You can once again inspect the latest runs by clicking on the link below and navigating to the \"two-player-dnd\" session."
+    "for run_id, result in zip(run_ids, evaluation_results):\n",
+    "    score = {\"CORRECT\": 1, \"INCORRECT\": 0}.get(result[\"text\"], 0)\n",
+    "    client.create_feedback(run_id, \"Accuracy\", score=score)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
-   "id": "2bf96f17-74c1-4f7d-8458-ae5ab5c6bd36",
+   "execution_count": 15,
+   "id": "8696f167-dc75-4ef8-8bb3-ac1ce8324f30",
   "metadata": {
    "tags": []
   },
@ -1006,13 +628,13 @@
    {
     "data": {
      "text/html": [
-       "<a href=\"http://localhost\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
+       "<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
      ],
      "text/plain": [
-       "LangChainPlusClient (API URL: http://localhost:8000)"
+       "LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
      ]
     },
-     "execution_count": 25,
+     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -1024,7 +646,7 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "df80cd88-cd6f-4fdc-965f-f74600e1f286",
+   "id": "daf7dc7f-a5b0-49be-a695-2a87e283e588",
   "metadata": {},
   "outputs": [],
   "source": []
@ -1046,7 +668,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.11.2"
  }
 },
 "nbformat": 4,
--- a/tests/integration_tests/client/test_client.py
+++ b/tests/integration_tests/client/test_client.py
@ -1,10 +1,13 @@
 """LangChain+ langchain_client Integration Tests."""
+import os
 from uuid import uuid4

 import pytest
 from tenacity import RetryError

+from langchain.agents import AgentType, initialize_agent, load_tools
 from langchain.callbacks.manager import tracing_v2_enabled
+from langchain.chat_models import ChatOpenAI
 from langchain.client import LangChainPlusClient
 from langchain.tools.base import tool

@ -50,3 +53,64 @@ def test_sessions(
        langchain_client.delete_session(session_name=new_session)
    with pytest.raises(RetryError):
        langchain_client.read_run(run_id=str(runs[0].id))
+
+
+def test_feedback_cycle(
+    monkeypatch: pytest.MonkeyPatch, langchain_client: LangChainPlusClient
+) -> None:
+    """Test that feedback can be created, listed, and deleted.
+
+    The test runs an agent under two differently named sessions, attaches
+    feedback to the resulting runs, checks what ``list_feedback`` returns for
+    those runs, and finally deletes every feedback item and checks that none
+    are left.
+
+    Args:
+        monkeypatch: pytest fixture used to set environment variables for the
+            duration of this test.
+        langchain_client: client under test, used for all run and feedback
+            calls.
+    """
+    # Environment for the first agent run. The session name embeds a fresh
+    # UUID so it does not collide with sessions left over from other runs.
+    # NOTE(review): presumably the tracer reads these variables to decide
+    # where and under which session to log -- confirm against the tracer.
+    monkeypatch.setenv("LANGCHAIN_TRACING_V2", "true")
+    monkeypatch.setenv("LANGCHAIN_SESSION", f"Feedback Testing {uuid4()}")
+    monkeypatch.setenv("LANGCHAIN_ENDPOINT", "http://localhost:1984")
+    llm = ChatOpenAI(temperature=0)
+    tools = load_tools(["serpapi", "llm-math"], llm=llm)
+    agent = initialize_agent(
+        tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False
+    )
+
+    # First run: no explicit tracing context, so it relies on the environment
+    # variables set above.
+    agent.run(
+        "What is the population of Kuala Lumpur as of January, 2023?"
+        " What is it's square root?"
+    )
+    # Second run: explicitly traced under a separate, uniquely named session.
+    other_session_name = f"Feedback Testing {uuid4()}"
+    with tracing_v2_enabled(session_name=other_session_name):
+        try:
+            agent.run("What is the square root of 3?")
+        except Exception as e:
+            # An agent failure here is only printed; it does not fail the test.
+            print(e)
+    # Non-errored runs with execution_order=1 in the env-configured session
+    # (presumably the top-level run of each trace -- confirm). Exactly one is
+    # expected, from the first agent.run call.
+    runs = list(
+        langchain_client.list_runs(
+            session_name=os.environ["LANGCHAIN_SESSION"], error=False, execution_order=1
+        )
+    )
+    assert len(runs) == 1
+    # Runs with execution_order=2 in the same session; at least one must exist.
+    order_2 = list(
+        langchain_client.list_runs(
+            session_name=os.environ["LANGCHAIN_SESSION"], execution_order=2
+        )
+    )
+    assert len(order_2) > 0
+    # Attach one feedback item to an execution_order=2 run and one to the
+    # execution_order=1 run. Listing by the latter's id alone must return only
+    # the feedback created for it.
+    langchain_client.create_feedback(str(order_2[0].id), "test score", score=0)
+    feedback = langchain_client.create_feedback(str(runs[0].id), "test score", score=1)
+    feedbacks = list(langchain_client.list_feedback(run_ids=[str(runs[0].id)]))
+    assert len(feedbacks) == 1
+    assert feedbacks[0].id == feedback.id
+
+    # Add feedback to other session
+    other_runs = list(
+        langchain_client.list_runs(session_name=other_session_name, execution_order=1)
+    )
+    assert len(other_runs) == 1
+    langchain_client.create_feedback(
+        run_id=str(other_runs[0].id), key="test score", score=0
+    )
+    # Collect every run from both sessions and list feedback across all of
+    # them: the three items created above are expected.
+    all_runs = list(
+        langchain_client.list_runs(session_name=os.environ["LANGCHAIN_SESSION"])
+    ) + list(langchain_client.list_runs(session_name=other_session_name))
+    test_run_ids = [str(run.id) for run in all_runs]
+    all_feedback = list(langchain_client.list_feedback(run_ids=test_run_ids))
+    assert len(all_feedback) == 3
+    # Delete every feedback item, then confirm none remain for these runs.
+    # Note that the loop variable rebinds the earlier ``feedback`` name.
+    for feedback in all_feedback:
+        langchain_client.delete_feedback(str(feedback.id))
+    feedbacks = list(langchain_client.list_feedback(run_ids=test_run_ids))
+    assert len(feedbacks) == 0