From 3adb1e12cae290a4fe6500bed04e07219d652e74 Mon Sep 17 00:00:00 2001
From: shibuiwilliam
Date: Wed, 9 Aug 2023 23:57:18 +0900
Subject: [PATCH] make trajectory eval chain stricter and add unit tests
 (#8909)

- update trajectory eval logic to be stricter
- add tests to trajectory eval chain
---
 .../agents/trajectory_eval_chain.py           | 22 +++++--
 .../evaluation/agents/test_eval_chain.py      | 63 ++++++++++++++++++-
 2 files changed, 77 insertions(+), 8 deletions(-)

diff --git a/libs/langchain/langchain/evaluation/agents/trajectory_eval_chain.py b/libs/langchain/langchain/evaluation/agents/trajectory_eval_chain.py
index 797bed5a99..55d0ac1fbc 100644
--- a/libs/langchain/langchain/evaluation/agents/trajectory_eval_chain.py
+++ b/libs/langchain/langchain/evaluation/agents/trajectory_eval_chain.py
@@ -5,6 +5,7 @@ the sequence of actions taken and their outcomes. It uses a language model
 chain (LLMChain) to generate the reasoning and scores.
 """
 
+import re
 from typing import (
     Any,
     Dict,
@@ -74,15 +75,24 @@ class TrajectoryOutputParser(BaseOutputParser):
 
         reasoning, score_str = reasoning.strip(), score_str.strip()
 
-        score_str = next(
-            (char for char in score_str if char.isdigit()), "0"
-        )  # Scan for first digit
-
-        if not 1 <= int(score_str) <= 5:
+        # Use a regex to extract the score from the string.
+        # The pattern also matches floats and multi-digit numbers:
+        # e.g. "Score: 1" yields 1, "Score: 3.5" yields 3.5, and
+        # "Score: 10" yields 10.
+        # A valid score, however, must be an integer in the range 1-5.
+        _score = re.search(r"(\d+(\.\d+)?)", score_str)
+        # If no score is found, or it is a float, raise an exception.
+        if _score is None or "." in _score.group(1):
+            raise OutputParserException(
+                f"Score is not an integer in the range 1-5: {text}"
+            )
+        score = int(_score.group(1))
+        # If the score is not in the range 1-5, raise an exception.
+        if not 1 <= score <= 5:
             raise OutputParserException(
                 f"Score is not a digit in the range 1-5: {text}"
             )
-        normalized_score = (int(score_str) - 1) / 4
+        normalized_score = (score - 1) / 4
 
         return TrajectoryEval(score=normalized_score, reasoning=reasoning)
 
diff --git a/libs/langchain/tests/unit_tests/evaluation/agents/test_eval_chain.py b/libs/langchain/tests/unit_tests/evaluation/agents/test_eval_chain.py
index 847868f195..5acb93baa5 100644
--- a/libs/langchain/tests/unit_tests/evaluation/agents/test_eval_chain.py
+++ b/libs/langchain/tests/unit_tests/evaluation/agents/test_eval_chain.py
@@ -6,8 +6,12 @@ import pytest
 from pydantic import Field
 
 from langchain.callbacks.manager import CallbackManagerForLLMRun
-from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
-from langchain.schema import AgentAction, BaseMessage
+from langchain.evaluation.agents.trajectory_eval_chain import (
+    TrajectoryEval,
+    TrajectoryEvalChain,
+    TrajectoryOutputParser,
+)
+from langchain.schema import AgentAction, BaseMessage, OutputParserException
 from langchain.tools.base import tool
 from tests.unit_tests.llms.fake_chat_model import FakeChatModel
 
@@ -53,6 +57,61 @@ class _FakeTrajectoryChatModel(FakeChatModel):
         return self.queries[prompt]
 
 
+def test_trajectory_output_parser_parse() -> None:
+    trajectory_output_parser = TrajectoryOutputParser()
+    text = """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.
+
+Score: 2"""
+    got = trajectory_output_parser.parse(text)
+    want = TrajectoryEval(
+        score=0.25,
+        reasoning="""Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.""",
+    )
+
+    assert got["score"] == want["score"]
+    assert got["reasoning"] == want["reasoning"]
+
+    with pytest.raises(OutputParserException):
+        trajectory_output_parser.parse(
+            """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2."""
+        )
+
+    with pytest.raises(OutputParserException):
+        trajectory_output_parser.parse(
+            """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.
+
+Score: 9"""
+        )
+
+    with pytest.raises(OutputParserException):
+        trajectory_output_parser.parse(
+            """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.
+
+Score: 10"""
+        )
+
+    with pytest.raises(OutputParserException):
+        trajectory_output_parser.parse(
+            """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.
+
+Score: 0.1"""
+        )
+
+    with pytest.raises(OutputParserException):
+        trajectory_output_parser.parse(
+            """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.
+
+Score: One"""
+        )
+
+
 def test_trajectory_eval_chain(
     intermediate_steps: List[Tuple[AgentAction, str]]
 ) -> None:
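
Not part of the patch: a minimal sketch of the stricter parsing behavior,
using only import paths that already appear in the test diff above; the
judgment strings are invented for illustration.

    from langchain.evaluation.agents.trajectory_eval_chain import (
        TrajectoryOutputParser,
    )
    from langchain.schema import OutputParserException

    parser = TrajectoryOutputParser()

    # An integer score in the range 1-5 still parses; parse() normalizes
    # it to (score - 1) / 4, so a 4 comes back as 0.75.
    got = parser.parse("Judgment: mostly correct tool use.\n\nScore: 4")
    assert got["score"] == 0.75

    # Floats, multi-digit scores, and non-numeric scores now raise.
    # Previously the parser kept only the first digit it saw, so
    # "Score: 10" parsed as 1 and "Score: 3.5" parsed as 3.
    for tail in ("Score: 3.5", "Score: 10", "Score: One"):
        try:
            parser.parse(f"Judgment: weak performance overall.\n\n{tail}")
        except OutputParserException:
            pass  # expected under the stricter rules
        else:
            raise AssertionError("expected OutputParserException")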