From 1f4a51cb9c84c6a061cda729c4cdd3783d98922c Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Thu, 6 Jul 2023 13:33:33 -0700 Subject: [PATCH] Add Agent Trajectory Interface (#7122) --- langchain/evaluation/__init__.py | 7 +- .../agents/trajectory_eval_chain.py | 29 +++-- langchain/evaluation/schema.py | 121 +++++++++++++++++- 3 files changed, 143 insertions(+), 14 deletions(-) diff --git a/langchain/evaluation/__init__.py b/langchain/evaluation/__init__.py index 1d88ae4b0e..b6dfc5027c 100644 --- a/langchain/evaluation/__init__.py +++ b/langchain/evaluation/__init__.py @@ -21,7 +21,11 @@ from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChai from langchain.evaluation.comparison import PairwiseStringEvalChain from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain -from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator +from langchain.evaluation.schema import ( + AgentTrajectoryEvaluator, + PairwiseStringEvaluator, + StringEvaluator, +) __all__ = [ "PairwiseStringEvalChain", @@ -32,4 +36,5 @@ __all__ = [ "PairwiseStringEvaluator", "TrajectoryEvalChain", "CriteriaEvalChain", + "AgentTrajectoryEvaluator", ] diff --git a/langchain/evaluation/agents/trajectory_eval_chain.py b/langchain/evaluation/agents/trajectory_eval_chain.py index 0987dd850d..dfd9a44c1e 100644 --- a/langchain/evaluation/agents/trajectory_eval_chain.py +++ b/langchain/evaluation/agents/trajectory_eval_chain.py @@ -22,6 +22,7 @@ from langchain.evaluation.agents.trajectory_eval_prompt import ( EVAL_CHAT_PROMPT, TOOL_FREE_EVAL_CHAT_PROMPT, ) +from langchain.evaluation.schema import AgentTrajectoryEvaluator from langchain.schema import AgentAction, BaseOutputParser, OutputParserException from langchain.tools.base import BaseTool @@ -70,7 +71,7 @@ class TrajectoryOutputParser(BaseOutputParser): return TrajectoryEval(score=int(score_str), reasoning=reasoning) -class TrajectoryEvalChain(Chain): +class TrajectoryEvalChain(AgentTrajectoryEvaluator, Chain): """A chain for evaluating ReAct style agents. This chain is used to evaluate ReAct style agents by reasoning about @@ -142,7 +143,9 @@ Description: {tool.description}""" ) @staticmethod - def get_agent_trajectory(steps: Union[str, List[Tuple[AgentAction, str]]]) -> str: + def get_agent_trajectory( + steps: Union[str, Sequence[Tuple[AgentAction, str]]] + ) -> str: """Get the agent trajectory as a formatted string. Args: @@ -308,12 +311,12 @@ The following is the expected answer. Use this to measure correctness: return {"score": parsed_output.score} - def evaluate_agent_trajectory( + def _evaluate_agent_trajectory( self, *, prediction: str, input: str, - agent_trajectory: Union[str, List[Tuple[AgentAction, str]]], + agent_trajectory: Sequence[Tuple[AgentAction, str]], reference: Optional[str] = None, callbacks: Callbacks = None, **kwargs: Any, @@ -321,11 +324,12 @@ The following is the expected answer. Use this to measure correctness: """Evaluate a trajectory. Args: - input (str): The input question. - agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]): + prediction (str): The final predicted response. + input (str): The input to the agent. + agent_trajectory (List[Tuple[AgentAction, str]]): The intermediate steps forming the agent trajectory. - prediction (str): The expected prediction. reference (Optional[str]): The reference answer. + callbacks (Callbacks): Callbacks to use for this chain run. Returns: dict: The evaluation result. @@ -338,12 +342,12 @@ The following is the expected answer. Use this to measure correctness: } return self(inputs=inputs, callbacks=callbacks, **kwargs) - async def aevaluate_agent_trajectory( + async def _aevaluate_agent_trajectory( self, *, prediction: str, input: str, - agent_trajectory: Union[str, List[Tuple[AgentAction, str]]], + agent_trajectory: Sequence[Tuple[AgentAction, str]], reference: Optional[str] = None, callbacks: Callbacks = None, **kwargs: Any, @@ -351,11 +355,12 @@ The following is the expected answer. Use this to measure correctness: """Asynchronously evaluate a trajectory. Args: - input (str): The input question. - agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]): + prediction (str): The final predicted response. + input (str): The input to the agent. + agent_trajectory (List[Tuple[AgentAction, str]]): The intermediate steps forming the agent trajectory. - prediction (str): The expected prediction. reference (Optional[str]): The reference answer. + callbacks (Callbacks): Callbacks to use for this chain run. Returns: dict: The evaluation result. diff --git a/langchain/evaluation/schema.py b/langchain/evaluation/schema.py index 4bcfc51307..bd6351a5c3 100644 --- a/langchain/evaluation/schema.py +++ b/langchain/evaluation/schema.py @@ -3,9 +3,11 @@ from __future__ import annotations import logging from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any, Optional, Sequence, Tuple from warnings import warn +from langchain.schema.agent import AgentAction + logger = logging.getLogger(__name__) @@ -275,3 +277,120 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC): input=input, **kwargs, ) + + +class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC): + """Interface for evaluating agent trajectories.""" + + @property + def requires_input(self) -> bool: + return True + + @abstractmethod + def _evaluate_agent_trajectory( + self, + *, + prediction: str, + agent_trajectory: Sequence[Tuple[AgentAction, str]], + input: str, + reference: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Evaluate a trajectory. + + Args: + prediction (str): The final predicted response. + agent_trajectory (List[Tuple[AgentAction, str]]): + The intermediate steps forming the agent trajectory. + input (str): The input to the agent. + reference (Optional[str]): The reference answer. + + Returns: + dict: The evaluation result. + """ + + async def _aevaluate_agent_trajectory( + self, + *, + prediction: str, + agent_trajectory: Sequence[Tuple[AgentAction, str]], + input: str, + reference: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Asynchronously evaluate a trajectory. + + Args: + prediction (str): The final predicted response. + agent_trajectory (List[Tuple[AgentAction, str]]): + The intermediate steps forming the agent trajectory. + input (str): The input to the agent. + reference (Optional[str]): The reference answer. + + Returns: + dict: The evaluation result. + """ + raise NotImplementedError( + f"{self.__class__.__name__} hasn't implemented an async " + "aevaluate_agent_trajectory method." + ) + + def evaluate_agent_trajectory( + self, + *, + prediction: str, + agent_trajectory: Sequence[Tuple[AgentAction, str]], + input: str, + reference: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Evaluate a trajectory. + + Args: + prediction (str): The final predicted response. + agent_trajectory (List[Tuple[AgentAction, str]]): + The intermediate steps forming the agent trajectory. + input (str): The input to the agent. + reference (Optional[str]): The reference answer. + + Returns: + dict: The evaluation result. + """ + self._check_evaluation_args(reference=reference, input=input) + return self._evaluate_agent_trajectory( + prediction=prediction, + input=input, + agent_trajectory=agent_trajectory, + reference=reference, + **kwargs, + ) + + async def aevaluate_agent_trajectory( + self, + *, + prediction: str, + agent_trajectory: Sequence[Tuple[AgentAction, str]], + input: str, + reference: Optional[str] = None, + **kwargs: Any, + ) -> dict: + """Asynchronously evaluate a trajectory. + + Args: + prediction (str): The final predicted response. + agent_trajectory (List[Tuple[AgentAction, str]]): + The intermediate steps forming the agent trajectory. + input (str): The input to the agent. + reference (Optional[str]): The reference answer. + + Returns: + dict: The evaluation result. + """ + self._check_evaluation_args(reference=reference, input=input) + return await self._aevaluate_agent_trajectory( + prediction=prediction, + input=input, + agent_trajectory=agent_trajectory, + reference=reference, + **kwargs, + )