From f421af8b80e04a39c2888e1c77ead1d2e24d960d Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Wed, 20 Sep 2023 11:18:33 -0700
Subject: [PATCH] Criteria Parser Improvements (#10824)

---
 .../evaluation/criteria/eval_chain.py         | 36 ++++++++++---
 .../evaluation/criteria/test_eval_chain.py    | 51 +++++++++++--------
 2 files changed, 59 insertions(+), 28 deletions(-)

diff --git a/libs/langchain/langchain/evaluation/criteria/eval_chain.py b/libs/langchain/langchain/evaluation/criteria/eval_chain.py
index dbd34cb6e8..c351217f43 100644
--- a/libs/langchain/langchain/evaluation/criteria/eval_chain.py
+++ b/libs/langchain/langchain/evaluation/criteria/eval_chain.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import re
 from enum import Enum
 from typing import Any, Dict, List, Mapping, Optional, Union
 
@@ -73,15 +74,36 @@ class CriteriaResultOutputParser(BaseOutputParser[dict]):
         Returns:
             Dict: The parsed output.
         """
-        parsed = text.strip().rsplit("\n", maxsplit=1)
-        if len(parsed) == 1:
-            reasoning = ""
-            verdict = parsed[0]
+        verdict = None
+        score = None
+        # \b keeps a final word that merely ends in "y"/"n" from being a verdict.
+        match_end = re.search(r"\b(Y|N)\b\s*$", text, re.IGNORECASE)
+        # Fallback: a leading verdict, possibly glued to the text that follows.
+        match_first = re.search(r"^\s*(Y|N)\s*", text, re.IGNORECASE)
+
+        if match_end:
+            verdict = match_end.group(1)
+            text = text[: match_end.start()]
+        elif match_first:
+            verdict = match_first.group(1)
+            text = text[match_first.end() :]
         else:
-            reasoning, verdict = parsed
-        score = 1 if verdict.upper() == "Y" else (0 if verdict.upper() == "N" else None)
+            # No recognizable verdict: report the last line as the raw value and
+            # only the lines before it as the reasoning.
+            splits = text.strip().rsplit("\n", maxsplit=1)
+            if len(splits) == 1:
+                text = ""
+                verdict = splits[0]
+            else:
+                text, verdict = splits
+
+        if verdict:
+            score = (
+                1 if verdict.upper() == "Y" else (0 if verdict.upper() == "N" else None)
+            )
+
         return {
-            "reasoning": reasoning.strip(),
+            "reasoning": text.strip(),
             "value": verdict,
             "score": score,
         }
diff --git a/libs/langchain/tests/unit_tests/evaluation/criteria/test_eval_chain.py b/libs/langchain/tests/unit_tests/evaluation/criteria/test_eval_chain.py
index cc1833f42b..f843854aff 100644
--- a/libs/langchain/tests/unit_tests/evaluation/criteria/test_eval_chain.py
+++ b/libs/langchain/tests/unit_tests/evaluation/criteria/test_eval_chain.py
@@ -24,29 +24,44 @@ def test_resolve_criteria_str() -> None:
     }
 
 
-def test_CriteriaResultOutputParser_parse() -> None:
-    output_parser = CriteriaResultOutputParser()
-    text = """Here is my step-by-step reasoning for the given criteria:
+@pytest.mark.parametrize(
+    "text,want",
+    [
+        ("Y", {"reasoning": "", "value": "Y", "score": 1}),
+        ("N", {"reasoning": "", "value": "N", "score": 0}),
+        (
+            "N\nThe submission is incorrect.",
+            {"reasoning": "The submission is incorrect.", "value": "N", "score": 0},
+        ),
+        (
+            """Here is my step-by-step reasoning for the given criteria:
 The criterion is: "Do you like cake?" I like cake.
-Y"""
+Y""",
+            {
+                "reasoning": """Here is my step-by-step reasoning for the given criteria:
+The criterion is: "Do you like cake?" I like cake.""",  # noqa: E501
+                "value": "Y",
+                "score": 1,
+            },
+        ),
+        (
+            " NThe submission N is correct, accurate, and factual. It accurately"
+            " identifies the specific effects of knowledge and interest on"
+            " these factors. Therefore, the submission Y meets the criteria. Y",
+            {
+                "reasoning": "NThe submission N is correct, accurate, and factual. It"
+                " accurately identifies the specific effects of knowledge and interest"
+                " on these factors. Therefore, the submission Y meets the criteria.",
+                "value": "Y",
+                "score": 1,
+            },
+        ),
+    ],
+)
+def test_CriteriaResultOutputParser_parse(text: str, want: dict) -> None:
+    """Check the verdict, reasoning and score parsed from raw grader output."""
+    output_parser = CriteriaResultOutputParser()
     got = output_parser.parse(text)
-    want = {
-        "reasoning": """Here is my step-by-step reasoning for the given criteria:
-The criterion is: "Do you like cake?" I like cake.""",
-        "value": "Y",
-        "score": 1,
-    }
-    assert got.get("reasoning") == want["reasoning"]
-    assert got.get("value") == want["value"]
-    assert got.get("score") == want["score"]
-
-    text = "Y"
-    got = output_parser.parse(text)
-    want = {
-        "reasoning": "",
-        "value": "Y",
-        "score": 1,
-    }
     assert got.get("reasoning") == want["reasoning"]
     assert got.get("value") == want["value"]
     assert got.get("score") == want["score"]