import re
import string
from typing import Any, List, Optional, Sequence, Tuple

# Explicit "GRADE: <verdict>" marker. The trailing word boundary keeps words
# that merely start with a verdict (e.g. "correctness", "incorrectly") from
# being misread as a grade.
_GRADE_PATTERN = re.compile(r"grade:\s*(correct|incorrect)\b", re.IGNORECASE)

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)

# Canonical verdict label -> numeric score.
_VERDICT_SCORES = {"CORRECT": 1, "INCORRECT": 0}


def _get_score(text: str) -> Optional[Tuple[str, int]]:
    """Extract the grader's verdict from raw LLM output.

    The lookup order is:

    1. The first explicit ``GRADE: CORRECT`` / ``GRADE: INCORRECT`` marker
       (case-insensitive) anywhere in the text.
    2. The first whitespace-delimited word, with punctuation removed.
    3. The last whitespace-delimited word, with punctuation removed.

    Args:
        text: The raw grader output.

    Returns:
        A ``(value, score)`` tuple, either ``("CORRECT", 1)`` or
        ``("INCORRECT", 0)``, or ``None`` when no verdict can be found
        (including empty or whitespace-only input).
    """
    match = _GRADE_PATTERN.search(text)
    if match:
        verdict = match.group(1).upper()
        return verdict, _VERDICT_SCORES[verdict]

    words = text.split()
    if not words:
        # Empty / whitespace-only output: nothing to fall back on.
        return None

    # Models that skip the "GRADE:" marker usually lead or end with the bare
    # verdict ("CORRECT\n\n..." or "... the answer is CORRECT.").
    for word in (words[0], words[-1]):
        verdict = word.translate(_STRIP_PUNCTUATION).upper()
        if verdict in _VERDICT_SCORES:
            return verdict, _VERDICT_SCORES[verdict]
    return None


def _parse_string_eval_output(text: str) -> dict:
    """Parse the raw grader output into a result dictionary.

    Args:
        text: The output text to parse.

    Returns:
        dict: A mapping with three keys:

        * ``"reasoning"`` - the full output with surrounding whitespace
          stripped.
        * ``"value"`` - ``"CORRECT"``, ``"INCORRECT"``, or ``None`` when no
          verdict could be extracted.
        * ``"score"`` - ``1``, ``0``, or ``None`` respectively.
    """
    reasoning = text.strip()
    parsed = _get_score(reasoning)
    value, score = parsed if parsed is not None else (None, None)
    return {
        "reasoning": reasoning,
        "value": value,
        "score": score,
    }