From 1f4a51cb9c84c6a061cda729c4cdd3783d98922c Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Thu, 6 Jul 2023 13:33:33 -0700
Subject: [PATCH] Add Agent Trajectory Interface (#7122)

---
 langchain/evaluation/__init__.py              |   7 +-
 .../agents/trajectory_eval_chain.py           |  29 +++--
 langchain/evaluation/schema.py                | 121 +++++++++++++++++-
 3 files changed, 143 insertions(+), 14 deletions(-)

diff --git a/langchain/evaluation/__init__.py b/langchain/evaluation/__init__.py
index 1d88ae4b0e..b6dfc5027c 100644
--- a/langchain/evaluation/__init__.py
+++ b/langchain/evaluation/__init__.py
@@ -21,7 +21,11 @@ from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChai
 from langchain.evaluation.comparison import PairwiseStringEvalChain
 from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
 from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
-from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
+from langchain.evaluation.schema import (
+    AgentTrajectoryEvaluator,
+    PairwiseStringEvaluator,
+    StringEvaluator,
+)
 
 __all__ = [
     "PairwiseStringEvalChain",
@@ -32,4 +36,5 @@ __all__ = [
     "PairwiseStringEvaluator",
     "TrajectoryEvalChain",
     "CriteriaEvalChain",
+    "AgentTrajectoryEvaluator",
 ]
diff --git a/langchain/evaluation/agents/trajectory_eval_chain.py b/langchain/evaluation/agents/trajectory_eval_chain.py
index 0987dd850d..dfd9a44c1e 100644
--- a/langchain/evaluation/agents/trajectory_eval_chain.py
+++ b/langchain/evaluation/agents/trajectory_eval_chain.py
@@ -22,6 +22,7 @@ from langchain.evaluation.agents.trajectory_eval_prompt import (
     EVAL_CHAT_PROMPT,
     TOOL_FREE_EVAL_CHAT_PROMPT,
 )
+from langchain.evaluation.schema import AgentTrajectoryEvaluator
 from langchain.schema import AgentAction, BaseOutputParser, OutputParserException
 from langchain.tools.base import BaseTool
 
@@ -70,7 +71,7 @@ class TrajectoryOutputParser(BaseOutputParser):
         return TrajectoryEval(score=int(score_str), reasoning=reasoning)
 
 
-class TrajectoryEvalChain(Chain):
+class TrajectoryEvalChain(AgentTrajectoryEvaluator, Chain):
     """A chain for evaluating ReAct style agents.
 
     This chain is used to evaluate ReAct style agents by reasoning about
@@ -142,7 +143,9 @@ Description: {tool.description}"""
         )
 
     @staticmethod
-    def get_agent_trajectory(steps: Union[str, List[Tuple[AgentAction, str]]]) -> str:
+    def get_agent_trajectory(
+        steps: Union[str, Sequence[Tuple[AgentAction, str]]]
+    ) -> str:
         """Get the agent trajectory as a formatted string.
 
         Args:
@@ -308,12 +311,12 @@ The following is the expected answer. Use this to measure correctness:
 
         return {"score": parsed_output.score}
 
-    def evaluate_agent_trajectory(
+    def _evaluate_agent_trajectory(
         self,
         *,
         prediction: str,
         input: str,
-        agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -321,11 +324,12 @@ The following is the expected answer. Use this to measure correctness:
         """Evaluate a trajectory.
 
         Args:
-            input (str): The input question.
-            agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
+            prediction (str): The final predicted response.
+            input (str): The input to the agent.
+            agent_trajectory (List[Tuple[AgentAction, str]]):
                 The intermediate steps forming the agent trajectory.
-            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.
+            callbacks (Callbacks): Callbacks to use for this chain run.
 
         Returns:
             dict: The evaluation result.
@@ -338,12 +342,12 @@ The following is the expected answer. Use this to measure correctness:
         }
         return self(inputs=inputs, callbacks=callbacks, **kwargs)
 
-    async def aevaluate_agent_trajectory(
+    async def _aevaluate_agent_trajectory(
         self,
         *,
         prediction: str,
         input: str,
-        agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
         reference: Optional[str] = None,
         callbacks: Callbacks = None,
         **kwargs: Any,
@@ -351,11 +355,12 @@ The following is the expected answer. Use this to measure correctness:
         """Asynchronously evaluate a trajectory.
 
         Args:
-            input (str): The input question.
-            agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
+            prediction (str): The final predicted response.
+            input (str): The input to the agent.
+            agent_trajectory (List[Tuple[AgentAction, str]]):
                 The intermediate steps forming the agent trajectory.
-            prediction (str): The expected prediction.
             reference (Optional[str]): The reference answer.
+            callbacks (Callbacks): Callbacks to use for this chain run.
 
         Returns:
             dict: The evaluation result.
diff --git a/langchain/evaluation/schema.py b/langchain/evaluation/schema.py
index 4bcfc51307..bd6351a5c3 100644
--- a/langchain/evaluation/schema.py
+++ b/langchain/evaluation/schema.py
@@ -3,9 +3,11 @@ from __future__ import annotations
 
 import logging
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, Optional, Sequence, Tuple
 from warnings import warn
 
+from langchain.schema.agent import AgentAction
+
 logger = logging.getLogger(__name__)
 
 
@@ -275,3 +277,120 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
             input=input,
             **kwargs,
         )
+
+
+class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
+    """Interface for evaluating agent trajectories."""
+
+    @property
+    def requires_input(self) -> bool:
+        return True  # trajectory evaluators require an input by default
+
+    @abstractmethod
+    def _evaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
+        input: str,
+        reference: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate a trajectory; abstract hook that subclasses must implement.
+
+        Args:
+            prediction (str): The final predicted response.
+            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
+                The intermediate steps forming the agent trajectory.
+            input (str): The input to the agent.
+            reference (Optional[str]): The reference answer.
+
+        Returns:
+            dict: The evaluation result.
+        """
+
+    async def _aevaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
+        input: str,
+        reference: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate a trajectory; raises unless overridden.
+
+        Args:
+            prediction (str): The final predicted response.
+            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
+                The intermediate steps forming the agent trajectory.
+            input (str): The input to the agent.
+            reference (Optional[str]): The reference answer.
+
+        Returns:
+            dict: The evaluation result.
+        """
+        raise NotImplementedError(
+            f"{self.__class__.__name__} hasn't implemented an async "
+            "aevaluate_agent_trajectory method."
+        )
+
+    def evaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
+        input: str,
+        reference: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Validate the arguments, then evaluate a trajectory.
+
+        Args:
+            prediction (str): The final predicted response.
+            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
+                The intermediate steps forming the agent trajectory.
+            input (str): The input to the agent.
+            reference (Optional[str]): The reference answer.
+
+        Returns:
+            dict: The evaluation result.
+        """
+        self._check_evaluation_args(reference=reference, input=input)
+        return self._evaluate_agent_trajectory(
+            prediction=prediction,
+            input=input,
+            agent_trajectory=agent_trajectory,
+            reference=reference,
+            **kwargs,
+        )
+
+    async def aevaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
+        input: str,
+        reference: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Validate the arguments, then asynchronously evaluate a trajectory.
+
+        Args:
+            prediction (str): The final predicted response.
+            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
+                The intermediate steps forming the agent trajectory.
+            input (str): The input to the agent.
+            reference (Optional[str]): The reference answer.
+
+        Returns:
+            dict: The evaluation result.
+        """
+        self._check_evaluation_args(reference=reference, input=input)
+        return await self._aevaluate_agent_trajectory(
+            prediction=prediction,
+            input=input,
+            agent_trajectory=agent_trajectory,
+            reference=reference,
+            **kwargs,
+        )