Make eval output parsers more robust (#10658)

Ran through a few hundred generations with some models to fix up the parsers
1 year ago · a3e5507faa
parent 3992c1ae9b
commit a3e5507faa
2 changed files with 94 additions and 13 deletions
--- a/libs/langchain/langchain/evaluation/qa/eval_chain.py
+++ b/libs/langchain/langchain/evaluation/qa/eval_chain.py
@@ -2,7 +2,8 @@
 from __future__ import annotations

 import re
-from typing import Any, List, Optional, Sequence
+import string
+from typing import Any, List, Optional, Sequence, Tuple

 from langchain.callbacks.manager import Callbacks
 from langchain.chains.llm import LLMChain
@@ -14,13 +15,32 @@ from langchain.schema import RUN_KEY
 from langchain.schema.language_model import BaseLanguageModel


-def _get_score(verdict: str) -> Optional[int]:
-    match = re.search(r"(?i)(?:grade:\s*)?(correct|incorrect)", verdict)
+def _get_score(text: str) -> Optional[Tuple[str, int]]:
+    match = re.search(r"grade:\s*(correct|incorrect)", text.strip(), re.IGNORECASE)
    if match:
        if match.group(1).upper() == "CORRECT":
-            return 1
+            return "CORRECT", 1
        elif match.group(1).upper() == "INCORRECT":
-            return 0
+            return "INCORRECT", 0
+    try:
+        first_word = (
+            text.strip().split()[0].translate(str.maketrans("", "", string.punctuation))
+        )
+        if first_word.upper() == "CORRECT":
+            return "CORRECT", 1
+        elif first_word.upper() == "INCORRECT":
+            return "INCORRECT", 0
+        last_word = (
+            text.strip()
+            .split()[-1]
+            .translate(str.maketrans("", "", string.punctuation))
+        )
+        if last_word.upper() == "CORRECT":
+            return "CORRECT", 1
+        elif last_word.upper() == "INCORRECT":
+            return "INCORRECT", 0
+    except IndexError:
+        pass
    return None


@@ -33,17 +53,15 @@ def _parse_string_eval_output(text: str) -> dict:
    Returns:
        Any: The parsed output.
    """
-    splits = text.strip().rsplit("\n", maxsplit=1)
-    if len(splits) == 1:
-        verdict = splits[0]
-        reasoning = None
+    reasoning = text.strip()
+    parsed_scores = _get_score(reasoning)
+    if parsed_scores is None:
+        value, score = None, None
    else:
-        reasoning, verdict = splits
-        reasoning = reasoning.strip()
-    score = _get_score(verdict)
+        value, score = parsed_scores
    return {
        "reasoning": reasoning,
-        "value": verdict,
+        "value": value,
        "score": score,
    }

--- a/libs/langchain/tests/unit_tests/evaluation/qa/test_eval_chain.py
+++ b/libs/langchain/tests/unit_tests/evaluation/qa/test_eval_chain.py
@@ -9,6 +9,7 @@ from langchain.evaluation.qa.eval_chain import (
    ContextQAEvalChain,
    CotQAEvalChain,
    QAEvalChain,
+    _parse_string_eval_output,
 )
 from langchain.evaluation.schema import StringEvaluator
 from tests.unit_tests.llms.fake_llm import FakeLLM
@@ -67,3 +68,65 @@ def test_returns_expected_results(
        prediction="my prediction", reference="my reference", input="my input"
    )
    assert results["score"] == 1
+
+
+@pytest.mark.parametrize(
+    "output,expected",
+    [
+        (
+            """ GRADE: CORRECT
+
+QUESTION: according to the passage, what is the main reason that the author wrote this passage?
+STUDENT ANSWER: to explain the importance of washing your hands
+TRUE ANSWER: to explain the importance of washing your hands
+GRADE:""",  # noqa: E501
+            {
+                "value": "CORRECT",
+                "score": 1,
+            },
+        ),
+        (
+            """ Here is my step-by-step reasoning to grade the student's answer:
+
+1. The question asks who founded the Roanoke settlement.
+
+2. The context states that the grade incorrect answer is Walter Raleigh. 
+
+3. The student's answer is "Sir Walter Raleigh".
+
+4. The student's answer matches the context, which states the answer is Walter Raleigh. 
+
+5. The addition of "Sir" in the student's answer does not contradict the context. It provides extra detail about Walter Raleigh's title, but the core answer of Walter Raleigh is still correct.
+
+6. Therefore, the student's answer contains the same factual information as the true answer, so it should be graded as correct.
+
+GRADE: CORRECT""",  # noqa: E501
+            {
+                "value": "CORRECT",
+                "score": 1,
+            },
+        ),
+        (
+            """  CORRECT
+
+QUESTION: who was the first president of the united states?
+STUDENT ANSWER: George Washington 
+TRUE ANSWER: George Washington was the first president of the United States.
+GRADE:""",
+            {
+                "value": "CORRECT",
+                "score": 1,
+            },
+        ),
+        (
+            """The student's answer is "Regent's Park," which matches the correct answer given in the context. Therefore, the student's answer is CORRECT.""",  # noqa: E501
+            {
+                "value": "CORRECT",
+                "score": 1,
+            },
+        ),
+    ],
+)
+def test_qa_output_parser(output: str, expected: dict) -> None:
+    expected["reasoning"] = output.strip()
+    assert _parse_string_eval_output(output) == expected