Add Agent Trajectory Interface (#7122)

This commit is contained in:
William FH 2023-07-06 13:33:33 -07:00 committed by GitHub
parent a6b39afe0e
commit 1f4a51cb9c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 143 additions and 14 deletions

View File

@ -21,7 +21,11 @@ from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChai
from langchain.evaluation.comparison import PairwiseStringEvalChain
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
from langchain.evaluation.schema import (
AgentTrajectoryEvaluator,
PairwiseStringEvaluator,
StringEvaluator,
)
__all__ = [
"PairwiseStringEvalChain",
@ -32,4 +36,5 @@ __all__ = [
"PairwiseStringEvaluator",
"TrajectoryEvalChain",
"CriteriaEvalChain",
"AgentTrajectoryEvaluator",
]

View File

@ -22,6 +22,7 @@ from langchain.evaluation.agents.trajectory_eval_prompt import (
EVAL_CHAT_PROMPT,
TOOL_FREE_EVAL_CHAT_PROMPT,
)
from langchain.evaluation.schema import AgentTrajectoryEvaluator
from langchain.schema import AgentAction, BaseOutputParser, OutputParserException
from langchain.tools.base import BaseTool
@ -70,7 +71,7 @@ class TrajectoryOutputParser(BaseOutputParser):
return TrajectoryEval(score=int(score_str), reasoning=reasoning)
class TrajectoryEvalChain(Chain):
class TrajectoryEvalChain(AgentTrajectoryEvaluator, Chain):
"""A chain for evaluating ReAct style agents.
This chain is used to evaluate ReAct style agents by reasoning about
@ -142,7 +143,9 @@ Description: {tool.description}"""
)
@staticmethod
def get_agent_trajectory(steps: Union[str, List[Tuple[AgentAction, str]]]) -> str:
def get_agent_trajectory(
steps: Union[str, Sequence[Tuple[AgentAction, str]]]
) -> str:
"""Get the agent trajectory as a formatted string.
Args:
@ -308,12 +311,12 @@ The following is the expected answer. Use this to measure correctness:
return {"score": parsed_output.score}
def evaluate_agent_trajectory(
def _evaluate_agent_trajectory(
self,
*,
prediction: str,
input: str,
agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
agent_trajectory: Sequence[Tuple[AgentAction, str]],
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
@ -321,11 +324,12 @@ The following is the expected answer. Use this to measure correctness:
"""Evaluate a trajectory.
Args:
input (str): The input question.
agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
prediction (str): The final predicted response.
input (str): The input to the agent.
agent_trajectory (List[Tuple[AgentAction, str]]):
The intermediate steps forming the agent trajectory.
prediction (str): The expected prediction.
reference (Optional[str]): The reference answer.
callbacks (Callbacks): Callbacks to use for this chain run.
Returns:
dict: The evaluation result.
@ -338,12 +342,12 @@ The following is the expected answer. Use this to measure correctness:
}
return self(inputs=inputs, callbacks=callbacks, **kwargs)
async def aevaluate_agent_trajectory(
async def _aevaluate_agent_trajectory(
self,
*,
prediction: str,
input: str,
agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
agent_trajectory: Sequence[Tuple[AgentAction, str]],
reference: Optional[str] = None,
callbacks: Callbacks = None,
**kwargs: Any,
@ -351,11 +355,12 @@ The following is the expected answer. Use this to measure correctness:
"""Asynchronously evaluate a trajectory.
Args:
input (str): The input question.
agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
prediction (str): The final predicted response.
input (str): The input to the agent.
agent_trajectory (List[Tuple[AgentAction, str]]):
The intermediate steps forming the agent trajectory.
prediction (str): The expected prediction.
reference (Optional[str]): The reference answer.
callbacks (Callbacks): Callbacks to use for this chain run.
Returns:
dict: The evaluation result.

View File

@ -3,9 +3,11 @@ from __future__ import annotations
import logging
from abc import ABC, abstractmethod
from typing import Any, Optional
from typing import Any, Optional, Sequence, Tuple
from warnings import warn
from langchain.schema.agent import AgentAction
logger = logging.getLogger(__name__)
@ -275,3 +277,120 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
input=input,
**kwargs,
)
class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
"""Interface for evaluating agent trajectories."""
@property
def requires_input(self) -> bool:
return True
@abstractmethod
def _evaluate_agent_trajectory(
self,
*,
prediction: str,
agent_trajectory: Sequence[Tuple[AgentAction, str]],
input: str,
reference: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Evaluate a trajectory.
Args:
prediction (str): The final predicted response.
agent_trajectory (List[Tuple[AgentAction, str]]):
The intermediate steps forming the agent trajectory.
input (str): The input to the agent.
reference (Optional[str]): The reference answer.
Returns:
dict: The evaluation result.
"""
async def _aevaluate_agent_trajectory(
self,
*,
prediction: str,
agent_trajectory: Sequence[Tuple[AgentAction, str]],
input: str,
reference: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate a trajectory.
Args:
prediction (str): The final predicted response.
agent_trajectory (List[Tuple[AgentAction, str]]):
The intermediate steps forming the agent trajectory.
input (str): The input to the agent.
reference (Optional[str]): The reference answer.
Returns:
dict: The evaluation result.
"""
raise NotImplementedError(
f"{self.__class__.__name__} hasn't implemented an async "
"aevaluate_agent_trajectory method."
)
def evaluate_agent_trajectory(
self,
*,
prediction: str,
agent_trajectory: Sequence[Tuple[AgentAction, str]],
input: str,
reference: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Evaluate a trajectory.
Args:
prediction (str): The final predicted response.
agent_trajectory (List[Tuple[AgentAction, str]]):
The intermediate steps forming the agent trajectory.
input (str): The input to the agent.
reference (Optional[str]): The reference answer.
Returns:
dict: The evaluation result.
"""
self._check_evaluation_args(reference=reference, input=input)
return self._evaluate_agent_trajectory(
prediction=prediction,
input=input,
agent_trajectory=agent_trajectory,
reference=reference,
**kwargs,
)
async def aevaluate_agent_trajectory(
self,
*,
prediction: str,
agent_trajectory: Sequence[Tuple[AgentAction, str]],
input: str,
reference: Optional[str] = None,
**kwargs: Any,
) -> dict:
"""Asynchronously evaluate a trajectory.
Args:
prediction (str): The final predicted response.
agent_trajectory (List[Tuple[AgentAction, str]]):
The intermediate steps forming the agent trajectory.
input (str): The input to the agent.
reference (Optional[str]): The reference answer.
Returns:
dict: The evaluation result.
"""
self._check_evaluation_args(reference=reference, input=input)
return await self._aevaluate_agent_trajectory(
prediction=prediction,
input=input,
agent_trajectory=agent_trajectory,
reference=reference,
**kwargs,
)