From aab2a7cd4b7849e5dc465e14a192a58549394bd1 Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Thu, 13 Jul 2023 09:58:28 -0700
Subject: [PATCH] Normalize Trajectory Eval Score (#7668)

---
 langchain/evaluation/__init__.py                 |  2 +-
 .../evaluation/agents/trajectory_eval_chain.py   | 16 ++++++++++------
 .../evaluation/agents/test_eval_chain.py         | 12 ++++++------
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/langchain/evaluation/__init__.py b/langchain/evaluation/__init__.py
index e675347572..ac5d2ace97 100644
--- a/langchain/evaluation/__init__.py
+++ b/langchain/evaluation/__init__.py
@@ -37,7 +37,7 @@ name of the dataset to load.
 - Grading the accuracy of a response against ground truth answers: :class:`QAEvalChain `
 - Comparing the output of two models: :class:`PairwiseStringEvalChain ` or :class:`LabeledPairwiseStringEvalChain ` when there is additionally a reference label.
 - Judging the efficacy of an agent's tool usage: :class:`TrajectoryEvalChain `
-- Checking whether an output complies with a set of criteria: :class:`CriteriaEvalChain `
+- Checking whether an output complies with a set of criteria: :class:`CriteriaEvalChain ` or :class:`LabeledCriteriaEvalChain ` when there is additionally a reference label.
 - Computing semantic difference between a prediction and reference: :class:`EmbeddingDistanceEvalChain ` or between two predictions: :class:`PairwiseEmbeddingDistanceEvalChain `
 - Measuring the string distance between a prediction and reference :class:`StringDistanceEvalChain ` or between two predictions :class:`PairwiseStringDistanceEvalChain `

diff --git a/langchain/evaluation/agents/trajectory_eval_chain.py b/langchain/evaluation/agents/trajectory_eval_chain.py
index 999e603f6f..39704b3f86 100644
--- a/langchain/evaluation/agents/trajectory_eval_chain.py
+++ b/langchain/evaluation/agents/trajectory_eval_chain.py
@@ -27,8 +27,12 @@ from langchain.tools.base import BaseTool


 class TrajectoryEval(NamedTuple):
-    score: int
+    """A named tuple containing the score and reasoning for a trajectory."""
+
+    score: float
+    """The score for the trajectory, normalized from 0 to 1."""
     reasoning: str
+    """The reasoning for the score."""


 class TrajectoryOutputParser(BaseOutputParser):
@@ -43,11 +47,11 @@ class TrajectoryOutputParser(BaseOutputParser):
             text (str): The output text to parse.

         Returns:
-            TrajectoryEval: A named tuple containing the score and reasoning.
+            TrajectoryEval: A named tuple containing the normalized score and reasoning.

         Raises:
             OutputParserException: If the score is not found in the output text or
-                if the score is not a digit in the range 1-5.
+                if the LLM's score is not a digit in the range 1-5.
         """
         if "Score:" not in text:
             raise OutputParserException(
@@ -66,8 +70,8 @@ class TrajectoryOutputParser(BaseOutputParser):
             raise OutputParserException(
                 f"Score is not a digit in the range 1-5: {text}"
             )
-
-        return TrajectoryEval(score=int(score_str), reasoning=reasoning)
+        normalized_score = (int(score_str) - 1) / 4
+        return TrajectoryEval(score=normalized_score, reasoning=reasoning)


 class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
@@ -90,7 +94,7 @@ class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
                 \"\"\"Very helpful answers to geography questions.\"\"\"
                 return f"{country}? IDK - We may never know {question}."

-        llm = ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0)
+        llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
         agent = initialize_agent(
             tools=[geography_answers],
             llm=llm,

diff --git a/tests/unit_tests/evaluation/agents/test_eval_chain.py b/tests/unit_tests/evaluation/agents/test_eval_chain.py
index c8c84ae574..847868f195 100644
--- a/tests/unit_tests/evaluation/agents/test_eval_chain.py
+++ b/tests/unit_tests/evaluation/agents/test_eval_chain.py
@@ -70,7 +70,7 @@ def test_trajectory_eval_chain(
         agent_trajectory=intermediate_steps,
         prediction="I like pie.",
     )
-    assert res["score"] == 5
+    assert res["score"] == 1.0
     # Test when ref is provided
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
@@ -78,7 +78,7 @@ def test_trajectory_eval_chain(
         prediction="I like pie.",
         reference="Paris",
     )
-    assert res["score"] == 1
+    assert res["score"] == 0.0


 def test_trajectory_eval_chain_no_tools(
@@ -97,14 +97,14 @@ def test_trajectory_eval_chain_no_tools(
         agent_trajectory=intermediate_steps,
         prediction="I like pie.",
     )
-    assert res["score"] == 5
+    assert res["score"] == 1.0
     res = chain.evaluate_agent_trajectory(
         input="What is your favorite food?",
         agent_trajectory=intermediate_steps,
         prediction="I like pie.",
         reference="Paris",
     )
-    assert res["score"] == 1
+    assert res["score"] == 0.0


 def test_old_api_works(intermediate_steps: List[Tuple[AgentAction, str]]) -> None:
@@ -123,7 +123,7 @@ def test_old_api_works(intermediate_steps: List[Tuple[AgentAction, str]]) -> Non
             "answer": "I like pie.",
         }
     )
-    assert res["score"] == 5
+    assert res["score"] == 1.0

     res = chain(
         {
@@ -133,4 +133,4 @@ def test_old_api_works(intermediate_steps: List[Tuple[AgentAction, str]]) -> Non
             "reference": "Paris",
         }
     )
-    assert res["score"] == 1
+    assert res["score"] == 0.0
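
The core of the patch is the rescaling in TrajectoryOutputParser.parse: the grader LLM still returns an integer grade from 1 to 5, but the reported score becomes (grade - 1) / 4, so 1 maps to 0.0, 3 to 0.5, and 5 to 1.0. Below is a minimal, self-contained sketch of that parsing-plus-normalization step. It is not the library code: the function name parse_grader_output and the sample grader outputs are invented for illustration, and only the normalization formula and error messages are taken directly from the diff.

from typing import NamedTuple


class TrajectoryEval(NamedTuple):
    score: float  # normalized to the range [0.0, 1.0]
    reasoning: str


def parse_grader_output(text: str) -> TrajectoryEval:
    """Sketch of the patched parse-and-normalize logic (hypothetical helper)."""
    if "Score:" not in text:
        raise ValueError(f"Could not find score in model eval output: {text}")
    reasoning, _, score_str = text.partition("Score:")
    reasoning, score_str = reasoning.strip(), score_str.strip()
    if not score_str.isdigit() or not 1 <= int(score_str) <= 5:
        raise ValueError(f"Score is not a digit in the range 1-5: {text}")
    # Rescale the 1-5 grade: 1 -> 0.0, 2 -> 0.25, 3 -> 0.5, 4 -> 0.75, 5 -> 1.0.
    normalized_score = (int(score_str) - 1) / 4
    return TrajectoryEval(score=normalized_score, reasoning=reasoning)


# Hypothetical grader outputs, invented for illustration only.
print(parse_grader_output("The agent picked a sensible tool.\nScore: 5").score)  # 1.0
print(parse_grader_output("The agent ignored the question.\nScore: 1").score)    # 0.0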
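
At the chain level, the updated tests show that evaluate_agent_trajectory now reports res["score"] as a float in [0.0, 1.0] rather than an int in 1-5. The following usage sketch illustrates that downstream effect; it requires an OpenAI API key, and the model choice, question, and example trajectory are assumptions for illustration, not part of the patch.

from langchain.chat_models import ChatOpenAI
from langchain.evaluation import EvaluatorType, load_evaluator
from langchain.schema import AgentAction

# Load the trajectory evaluator with an assumed grading model.
evaluator = load_evaluator(
    EvaluatorType.AGENT_TRAJECTORY,
    llm=ChatOpenAI(model="gpt-4", temperature=0),
)
res = evaluator.evaluate_agent_trajectory(
    input="What is the capital of France?",
    agent_trajectory=[
        (
            AgentAction(tool="lookup_capital", tool_input="France", log="look it up"),
            "Paris",
        )
    ],
    prediction="The capital of France is Paris.",
)
# After this patch the score is normalized, e.g. a grade of 5 surfaces as 1.0.
assert 0.0 <= res["score"] <= 1.0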