Make eval output parsers more robust (#10658)

Ran through a few hundred generations with some models to fix up the
parsers
pull/5628/head^2
William FH 1 year ago committed by GitHub
parent 3992c1ae9b
commit a3e5507faa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -2,7 +2,8 @@
from __future__ import annotations
import re
from typing import Any, List, Optional, Sequence
import string
from typing import Any, List, Optional, Sequence, Tuple
from langchain.callbacks.manager import Callbacks
from langchain.chains.llm import LLMChain
@ -14,13 +15,32 @@ from langchain.schema import RUN_KEY
from langchain.schema.language_model import BaseLanguageModel
def _get_score(verdict: str) -> Optional[int]:
match = re.search(r"(?i)(?:grade:\s*)?(correct|incorrect)", verdict)
def _get_score(text: str) -> Optional[Tuple[str, int]]:
match = re.search(r"grade:\s*(correct|incorrect)", text.strip(), re.IGNORECASE)
if match:
if match.group(1).upper() == "CORRECT":
return 1
return "CORRECT", 1
elif match.group(1).upper() == "INCORRECT":
return 0
return "INCORRECT", 0
try:
first_word = (
text.strip().split()[0].translate(str.maketrans("", "", string.punctuation))
)
if first_word.upper() == "CORRECT":
return "CORRECT", 1
elif first_word.upper() == "INCORRECT":
return "INCORRECT", 0
last_word = (
text.strip()
.split()[-1]
.translate(str.maketrans("", "", string.punctuation))
)
if last_word.upper() == "CORRECT":
return "CORRECT", 1
elif last_word.upper() == "INCORRECT":
return "INCORRECT", 0
except IndexError:
pass
return None
@ -33,17 +53,15 @@ def _parse_string_eval_output(text: str) -> dict:
Returns:
Any: The parsed output.
"""
splits = text.strip().rsplit("\n", maxsplit=1)
if len(splits) == 1:
verdict = splits[0]
reasoning = None
reasoning = text.strip()
parsed_scores = _get_score(reasoning)
if parsed_scores is None:
value, score = None, None
else:
reasoning, verdict = splits
reasoning = reasoning.strip()
score = _get_score(verdict)
value, score = parsed_scores
return {
"reasoning": reasoning,
"value": verdict,
"value": value,
"score": score,
}

@ -9,6 +9,7 @@ from langchain.evaluation.qa.eval_chain import (
ContextQAEvalChain,
CotQAEvalChain,
QAEvalChain,
_parse_string_eval_output,
)
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.llms.fake_llm import FakeLLM
@ -67,3 +68,65 @@ def test_returns_expected_results(
prediction="my prediction", reference="my reference", input="my input"
)
assert results["score"] == 1
@pytest.mark.parametrize(
    "output,expected",
    [
        (
            """ GRADE: CORRECT
QUESTION: according to the passage, what is the main reason that the author wrote this passage?
STUDENT ANSWER: to explain the importance of washing your hands
TRUE ANSWER: to explain the importance of washing your hands
GRADE:""",  # noqa: E501
            {
                "value": "CORRECT",
                "score": 1,
            },
        ),
        (
            """ Here is my step-by-step reasoning to grade the student's answer:
1. The question asks who founded the Roanoke settlement.
2. The context states that the grade incorrect answer is Walter Raleigh.
3. The student's answer is "Sir Walter Raleigh".
4. The student's answer matches the context, which states the answer is Walter Raleigh.
5. The addition of "Sir" in the student's answer does not contradict the context. It provides extra detail about Walter Raleigh's title, but the core answer of Walter Raleigh is still correct.
6. Therefore, the student's answer contains the same factual information as the true answer, so it should be graded as correct.
GRADE: CORRECT""",  # noqa: E501
            {
                "value": "CORRECT",
                "score": 1,
            },
        ),
        (
            """ CORRECT
QUESTION: who was the first president of the united states?
STUDENT ANSWER: George Washington
TRUE ANSWER: George Washington was the first president of the United States.
GRADE:""",
            {
                "value": "CORRECT",
                "score": 1,
            },
        ),
        (
            """The student's answer is "Regent's Park," which matches the correct answer given in the context. Therefore, the student's answer is CORRECT.""",  # noqa: E501
            {
                "value": "CORRECT",
                "score": 1,
            },
        ),
    ],
)
def test_qa_output_parser(output: str, expected: dict) -> None:
    """Check that raw grader output is parsed into value, score and reasoning.

    Each case supplies the raw text and the expected ``value``/``score``;
    the expected ``reasoning`` is the stripped raw text.
    """
    # Build a fresh dict instead of mutating the parametrized `expected`
    # object, which is shared with the parametrize argument list.
    assert _parse_string_eval_output(output) == {
        **expected,
        "reasoning": output.strip(),
    }

Loading…
Cancel
Save