|
|
|
@ -22,6 +22,7 @@ from langchain.evaluation.agents.trajectory_eval_prompt import (
|
|
|
|
|
EVAL_CHAT_PROMPT,
|
|
|
|
|
TOOL_FREE_EVAL_CHAT_PROMPT,
|
|
|
|
|
)
|
|
|
|
|
from langchain.evaluation.schema import AgentTrajectoryEvaluator
|
|
|
|
|
from langchain.schema import AgentAction, BaseOutputParser, OutputParserException
|
|
|
|
|
from langchain.tools.base import BaseTool
|
|
|
|
|
|
|
|
|
@ -70,7 +71,7 @@ class TrajectoryOutputParser(BaseOutputParser):
|
|
|
|
|
return TrajectoryEval(score=int(score_str), reasoning=reasoning)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TrajectoryEvalChain(Chain):
|
|
|
|
|
class TrajectoryEvalChain(AgentTrajectoryEvaluator, Chain):
|
|
|
|
|
"""A chain for evaluating ReAct style agents.
|
|
|
|
|
|
|
|
|
|
This chain is used to evaluate ReAct style agents by reasoning about
|
|
|
|
@ -142,7 +143,9 @@ Description: {tool.description}"""
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def get_agent_trajectory(steps: Union[str, List[Tuple[AgentAction, str]]]) -> str:
|
|
|
|
|
def get_agent_trajectory(
|
|
|
|
|
steps: Union[str, Sequence[Tuple[AgentAction, str]]]
|
|
|
|
|
) -> str:
|
|
|
|
|
"""Get the agent trajectory as a formatted string.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
@ -308,12 +311,12 @@ The following is the expected answer. Use this to measure correctness:
|
|
|
|
|
|
|
|
|
|
return {"score": parsed_output.score}
|
|
|
|
|
|
|
|
|
|
def evaluate_agent_trajectory(
|
|
|
|
|
def _evaluate_agent_trajectory(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
prediction: str,
|
|
|
|
|
input: str,
|
|
|
|
|
agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
|
|
|
|
|
agent_trajectory: Sequence[Tuple[AgentAction, str]],
|
|
|
|
|
reference: Optional[str] = None,
|
|
|
|
|
callbacks: Callbacks = None,
|
|
|
|
|
**kwargs: Any,
|
|
|
|
@ -321,11 +324,12 @@ The following is the expected answer. Use this to measure correctness:
|
|
|
|
|
"""Evaluate a trajectory.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
input (str): The input question.
|
|
|
|
|
agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
|
|
|
|
|
prediction (str): The final predicted response.
|
|
|
|
|
input (str): The input to the agent.
|
|
|
|
|
agent_trajectory (List[Tuple[AgentAction, str]]):
|
|
|
|
|
The intermediate steps forming the agent trajectory.
|
|
|
|
|
prediction (str): The expected prediction.
|
|
|
|
|
reference (Optional[str]): The reference answer.
|
|
|
|
|
callbacks (Callbacks): Callbacks to use for this chain run.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
dict: The evaluation result.
|
|
|
|
@ -338,12 +342,12 @@ The following is the expected answer. Use this to measure correctness:
|
|
|
|
|
}
|
|
|
|
|
return self(inputs=inputs, callbacks=callbacks, **kwargs)
|
|
|
|
|
|
|
|
|
|
async def aevaluate_agent_trajectory(
|
|
|
|
|
async def _aevaluate_agent_trajectory(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
prediction: str,
|
|
|
|
|
input: str,
|
|
|
|
|
agent_trajectory: Union[str, List[Tuple[AgentAction, str]]],
|
|
|
|
|
agent_trajectory: Sequence[Tuple[AgentAction, str]],
|
|
|
|
|
reference: Optional[str] = None,
|
|
|
|
|
callbacks: Callbacks = None,
|
|
|
|
|
**kwargs: Any,
|
|
|
|
@ -351,11 +355,12 @@ The following is the expected answer. Use this to measure correctness:
|
|
|
|
|
"""Asynchronously evaluate a trajectory.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
input (str): The input question.
|
|
|
|
|
agent_trajectory (Union[str, List[Tuple[AgentAction, str]]]):
|
|
|
|
|
prediction (str): The final predicted response.
|
|
|
|
|
input (str): The input to the agent.
|
|
|
|
|
agent_trajectory (List[Tuple[AgentAction, str]]):
|
|
|
|
|
The intermediate steps forming the agent trajectory.
|
|
|
|
|
prediction (str): The expected prediction.
|
|
|
|
|
reference (Optional[str]): The reference answer.
|
|
|
|
|
callbacks (Callbacks): Callbacks to use for this chain run.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
dict: The evaluation result.
|
|
|
|
|