From bc0af67aaf1ba7a1e1e973a5576c7eac601feecb Mon Sep 17 00:00:00 2001
From: Zander Chase <130414180+vowelparrot@users.noreply.github.com>
Date: Mon, 19 Jun 2023 21:11:50 -0700
Subject: [PATCH] Add Trajectory Eval RunEvaluator (#6449)

---
 .../run_evaluators/implementations.py         | 148 ++++++++++++++++--
 1 file changed, 136 insertions(+), 12 deletions(-)

diff --git a/langchain/evaluation/run_evaluators/implementations.py b/langchain/evaluation/run_evaluators/implementations.py
index 6b70d80d..8aea8e9d 100644
--- a/langchain/evaluation/run_evaluators/implementations.py
+++ b/langchain/evaluation/run_evaluators/implementations.py
@@ -1,11 +1,15 @@
-from typing import Any, Dict, Mapping, Optional, Sequence, Union
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Union
 
 from langchainplus_sdk.evaluation import EvaluationResult
-from langchainplus_sdk.schemas import Example, Run
-from pydantic import BaseModel
+from langchainplus_sdk.schemas import Example, Run, RunTypeEnum
+from pydantic import BaseModel, Field
 
 from langchain.base_language import BaseLanguageModel
 from langchain.chains.llm import LLMChain
+from langchain.chat_models.base import BaseChatModel
+from langchain.evaluation.agents.trajectory_eval_prompt import (
+    EVAL_CHAT_PROMPT as TRAJECTORY_PROMPT,
+)
 from langchain.evaluation.qa.eval_chain import QAEvalChain
 from langchain.evaluation.qa.eval_prompt import PROMPT as QA_DEFAULT_PROMPT
 from langchain.evaluation.qa.eval_prompt import SQL_PROMPT
@@ -17,7 +21,10 @@ from langchain.evaluation.run_evaluators.base import (
 from langchain.evaluation.run_evaluators.criteria_prompt import (
     PROMPT as CRITERIA_PROMPT,
 )
+from langchain.prompts.base import BasePromptTemplate
 from langchain.prompts.prompt import PromptTemplate
+from langchain.schema import OutputParserException
+from langchain.tools.base import BaseTool
 
 _QA_PROMPTS = {
     "qa": QA_DEFAULT_PROMPT,
@@ -28,18 +35,13 @@ _QA_PROMPTS = {
 class StringRunEvaluatorInputMapper(RunEvaluatorInputMapper, BaseModel):
     """Maps the Run and Optional[Example] to a dictionary."""
 
-    prediction_map: Mapping[str, str]
+    prediction_map: Dict[str, str]
     """Map from run outputs to the evaluation inputs."""
-    input_map: Mapping[str, str]
+    input_map: Dict[str, str]
     """Map from run inputs to the evaluation inputs."""
-    answer_map: Optional[Mapping[str, str]] = None
+    answer_map: Optional[Dict[str, str]] = None
     """Map from example outputs to the evaluation inputs."""
 
-    class Config:
-        """Pydantic config."""
-
-        arbitrary_types_allowed = True
-
     def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
         """Maps the Run and Optional[Example] to a dictionary"""
         if run.outputs is None:
@@ -166,7 +168,7 @@ def get_criteria_evaluator(
     *,
     input_key: str = "input",
     prediction_key: str = "output",
-    prompt: PromptTemplate = CRITERIA_PROMPT,
+    prompt: BasePromptTemplate = CRITERIA_PROMPT,
     evaluation_name: Optional[str] = None,
     **kwargs: Any,
 ) -> RunEvaluatorChain:
@@ -198,3 +200,125 @@ def get_criteria_evaluator(
         output_parser=parser,
         **kwargs,
     )
+
+
+class TrajectoryEvalOutputParser(RunEvaluatorOutputParser):
+    evaluation_name: str = "Agent Trajectory"
+    """The name assigned to the evaluation feedback."""
+    evaluator_info: dict = Field(default_factory=dict)
+    """Additional information to log as feedback metadata."""
+
+    def parse(self, text: str) -> EvaluationResult:
+        if "Score:" not in text:
+            raise OutputParserException(
+                f"Could not find score in model eval output: {text}"
+            )
+
+        reasoning, score_str = text.split("Score: ")
+
+        reasoning, score_str = reasoning.strip(), score_str.strip()
+
+        score_str = next(
+            (char for char in score_str if char.isdigit()), "0"
+        )  # Scan for first digit
+
+        if not 1 <= int(score_str) <= 5:
+            raise OutputParserException(
+                f"Score is not a digit in the range 1-5: {text}"
+            )
+
+        return EvaluationResult(
+            key=self.evaluation_name,
+            score=int(score_str),
+            comment=reasoning,
+            evaluator_info=self.evaluator_info,
+        )
+
+
+class TrajectoryInputMapper(RunEvaluatorInputMapper, BaseModel):
+    """Maps the Run and Optional[Example] to a dictionary."""
+
+    tool_descriptions: List[str]
+    """The descriptions for each of the tools available to the agent."""
+    agent_input_key: str = "input"
+    """The key to load from the agent executor's run input dictionary."""
+    agent_output_key: str = "output"
+    """The key to load from the agent executor's run output dictionary."""
+    tool_input_key: str = "input"
+    """The key to load from the tool executor's run input dictionary."""
+    tool_output_key: str = "output"
+    """The key to load from the tool executor's run output dictionary."""
+
+    def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, str]:
+        """Maps the Run and Optional[Example] to a dictionary"""
+        if run.child_runs is None:
+            raise ValueError("Run must have child runs to be evaluated.")
+        if run.outputs is None:
+            raise ValueError("Run must have outputs to be evaluated.")
+        question = run.inputs[self.agent_input_key]
+        tool_runs = [
+            run_ for run_ in run.child_runs if run_.run_type == RunTypeEnum.tool
+        ]
+        agent_steps = []
+        for i, run_ in enumerate(tool_runs, 1):
+            tool_output = (
+                f"Tool output: {run_.outputs.get(self.tool_output_key, run_.outputs)}"
+                if run_.outputs
+                else (f"Tool error: {run_.error}" if run_.error else "No output")
+            )
+            agent_steps.append(
+                f"""Step {i}:
+Tool used: {run_.name}
+Tool input: {run_.inputs.get(self.tool_input_key, run_.inputs)}
+{tool_output}"""
+            )
+
+        return {
+            "tool_descriptions": "\n\n".join(self.tool_descriptions),
+            "question": question,
+            "agent_trajectory": "\n\n".join(agent_steps),
+            "answer": run.outputs[self.agent_output_key],
+        }
+
+
+def get_trajectory_evaluator(
+    llm: BaseChatModel,
+    agent_tools: Union[Sequence[str], Sequence[BaseTool]],
+    *,
+    input_key: str = "input",
+    prediction_key: str = "output",
+    tool_input_key: str = "input",
+    tool_output_key: str = "output",
+    prompt: BasePromptTemplate = TRAJECTORY_PROMPT,
+    evaluation_name: str = "Agent Trajectory",
+    **kwargs: Any,
+) -> RunEvaluatorChain:
+    """Get an eval chain for grading an agent's trajectory of intermediate steps."""
+    tool_descriptions = [
+        f"Tool {i}: {tool.name}\nDescription: {tool.description}"
+        if isinstance(tool, BaseTool)
+        else f"Tool {i}: {tool}"
+        for i, tool in enumerate(agent_tools, 1)
+    ]
+
+    input_mapper = kwargs.pop(
+        "input_mapper",
+        TrajectoryInputMapper(
+            agent_input_key=input_key,
+            agent_output_key=prediction_key,
+            tool_input_key=tool_input_key,
+            tool_output_key=tool_output_key,
+            tool_descriptions=tool_descriptions,
+        ),
+    )
+    parser = kwargs.pop(
+        "output_parser",
+        TrajectoryEvalOutputParser(evaluation_name=evaluation_name),
+    )
+    eval_chain = LLMChain(llm=llm, prompt=prompt, **kwargs)
+    return RunEvaluatorChain(
+        eval_chain=eval_chain,
+        input_mapper=input_mapper,
+        output_parser=parser,
+        **kwargs,
+    )
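
Example usage (an illustrative sketch, not taken from the patch itself): the snippet below assumes ChatOpenAI credentials are configured, uses placeholder tool names ("search", "calculator"), and assumes `my_agent_run` is a traced agent Run (e.g. fetched with the langchainplus_sdk client) whose child runs contain the agent's tool calls. The commented-out `evaluate_run` call is the entry point expected from the RunEvaluator interface.

from langchain.chat_models import ChatOpenAI
from langchain.evaluation.run_evaluators.implementations import (
    get_trajectory_evaluator,
)

# Chat model used as the grader (any BaseChatModel should work).
llm = ChatOpenAI(temperature=0)

# Tool names (strings or BaseTool instances) the agent had access to; they are
# rendered into the prompt as "Tool 1: ...", "Tool 2: ...".
evaluator = get_trajectory_evaluator(llm, agent_tools=["search", "calculator"])

# The returned RunEvaluatorChain maps the run's tool-call trajectory into the
# trajectory prompt and parses a 1-5 score plus the model's reasoning.
# result = evaluator.evaluate_run(my_agent_run)  # my_agent_run: a traced agent Run
# print(result.score, result.comment)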