Add Agent Trajectory Interface (#7122)

1 year ago · 1f4a51cb9c
parent a6b39afe0e
commit 1f4a51cb9c
3 changed files with 143 additions and 14 deletions
--- a/langchain/evaluation/__init__.py
+++ b/langchain/evaluation/__init__.py
@@ -21,7 +21,11 @@ from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChai
 from langchain.evaluation.comparison import PairwiseStringEvalChain
 from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
 from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
-from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
+from langchain.evaluation.schema import (
+    AgentTrajectoryEvaluator,
+    PairwiseStringEvaluator,
+    StringEvaluator,
+)

 __all__ = [
    "PairwiseStringEvalChain",
@@ -32,4 +36,5 @@ __all__ = [
    "PairwiseStringEvaluator",
    "TrajectoryEvalChain",
    "CriteriaEvalChain",
+    "AgentTrajectoryEvaluator",
 ]
--- a/langchain/evaluation/agents/trajectory_eval_chain.py
+++ b/langchain/evaluation/agents/trajectory_eval_chain.py
@@ -22,6 +22,7 @@ from langchain.evaluation.agents.trajectory_eval_prompt import (
    EVAL_CHAT_PROMPT,
    TOOL_FREE_EVAL_CHAT_PROMPT,
 )
+from langchain.evaluation.schema import AgentTrajectoryEvaluator
 from langchain.schema import AgentAction, BaseOutputParser, OutputParserException
 from langchain.tools.base import BaseTool

@@ -70,7 +71,7 @@ class TrajectoryOutputParser(BaseOutputParser):
        return TrajectoryEval(score=int(score_str), reasoning=reasoning)


-class TrajectoryEvalChain(Chain):
+class TrajectoryEvalChain(AgentTrajectoryEvaluator, Chain):
    """A chain for evaluating ReAct style agents.

    This chain is used to evaluate ReAct style agents by reasoning about
@@ -142,7 +143,9 @@ Description: {tool.description}"""
        )

    @staticmethod
-    def get_agent_trajectory(steps: Union[str, List[Tuple[AgentAction, str]]]) -> str:
+    def get_agent_trajectory(
+        steps: Union[str, Sequence[Tuple[AgentAction, str]]]
+    ) -> str:
        """Get the agent trajectory as a formatted string.

        Args:
@@ -308,12 +311,12 @@ The following is the expected answer. Use this to measure correctness:

        return {"score": parsed_output.score}

-    def evaluate_agent_trajectory(
+    def _evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        input: str,
-        agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
@@ -321,11 +324,12 @@ The following is the expected answer. Use this to measure correctness:
        """Evaluate a trajectory.

        Args:
-            input (str): The input question.
-            agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
+            prediction (str): The final predicted response.
+            input (str): The input to the agent.
+            agent_trajectory (List[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
-            prediction (str): The expected prediction.
            reference (Optional[str]): The reference answer.
+            callbacks (Callbacks): Callbacks to use for this chain run.

        Returns:
            dict: The evaluation result.
@@ -338,12 +342,12 @@ The following is the expected answer. Use this to measure correctness:
        }
        return self(inputs=inputs, callbacks=callbacks, **kwargs)

-    async def aevaluate_agent_trajectory(
+    async def _aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        input: str,
-        agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
@@ -351,11 +355,12 @@ The following is the expected answer. Use this to measure correctness:
        """Asynchronously evaluate a trajectory.

        Args:
-            input (str): The input question.
-            agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
+            prediction (str): The final predicted response.
+            input (str): The input to the agent.
+            agent_trajectory (List[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
-            prediction (str): The expected prediction.
            reference (Optional[str]): The reference answer.
+            callbacks (Callbacks): Callbacks to use for this chain run.

        Returns:
            dict: The evaluation result.
--- a/langchain/evaluation/schema.py
+++ b/langchain/evaluation/schema.py
@@ -3,9 +3,11 @@ from __future__ import annotations

 import logging
 from abc import ABC, abstractmethod
-from typing import Any, Optional
+from typing import Any, Optional, Sequence, Tuple
 from warnings import warn

+from langchain.schema.agent import AgentAction
+
 logger = logging.getLogger(__name__)


@@ -275,3 +277,120 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
            input=input,
            **kwargs,
        )
+
+
+class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
+    """Interface for evaluating agent trajectories."""
+
+    @property
+    def requires_input(self) -> bool:
+        return True
+
+    @abstractmethod
+    def _evaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
+        input: str,
+        reference: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate a trajectory.
+
+        Args:
+            prediction (str): The final predicted response.
+            agent_trajectory (List[Tuple[AgentAction, str]]):
+                The intermediate steps forming the agent trajectory.
+            input (str): The input to the agent.
+            reference (Optional[str]): The reference answer.
+
+        Returns:
+            dict: The evaluation result.
+        """
+
+    async def _aevaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
+        input: str,
+        reference: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate a trajectory.
+
+        Args:
+            prediction (str): The final predicted response.
+            agent_trajectory (List[Tuple[AgentAction, str]]):
+                The intermediate steps forming the agent trajectory.
+            input (str): The input to the agent.
+            reference (Optional[str]): The reference answer.
+
+        Returns:
+            dict: The evaluation result.
+        """
+        raise NotImplementedError(
+            f"{self.__class__.__name__} hasn't implemented an async "
+            "aevaluate_agent_trajectory method."
+        )
+
+    def evaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
+        input: str,
+        reference: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Evaluate a trajectory.
+
+        Args:
+            prediction (str): The final predicted response.
+            agent_trajectory (List[Tuple[AgentAction, str]]):
+                The intermediate steps forming the agent trajectory.
+            input (str): The input to the agent.
+            reference (Optional[str]): The reference answer.
+
+        Returns:
+            dict: The evaluation result.
+        """
+        self._check_evaluation_args(reference=reference, input=input)
+        return self._evaluate_agent_trajectory(
+            prediction=prediction,
+            input=input,
+            agent_trajectory=agent_trajectory,
+            reference=reference,
+            **kwargs,
+        )
+
+    async def aevaluate_agent_trajectory(
+        self,
+        *,
+        prediction: str,
+        agent_trajectory: Sequence[Tuple[AgentAction, str]],
+        input: str,
+        reference: Optional[str] = None,
+        **kwargs: Any,
+    ) -> dict:
+        """Asynchronously evaluate a trajectory.
+
+        Args:
+            prediction (str): The final predicted response.
+            agent_trajectory (List[Tuple[AgentAction, str]]):
+                The intermediate steps forming the agent trajectory.
+            input (str): The input to the agent.
+            reference (Optional[str]): The reference answer.
+
+        Returns:
+            dict: The evaluation result.
+        """
+        self._check_evaluation_args(reference=reference, input=input)
+        return await self._aevaluate_agent_trajectory(
+            prediction=prediction,
+            input=input,
+            agent_trajectory=agent_trajectory,
+            reference=reference,
+            **kwargs,
+        )