diff --git a/libs/langchain/langchain/evaluation/comparison/eval_chain.py b/libs/langchain/langchain/evaluation/comparison/eval_chain.py
index 21f623bf0f..9a63fff38b 100644
--- a/libs/langchain/langchain/evaluation/comparison/eval_chain.py
+++ b/libs/langchain/langchain/evaluation/comparison/eval_chain.py
@@ -1,12 +1,20 @@
 """Base classes for comparing the output of two models."""
 from __future__ import annotations
 
+import logging
+import re
 from typing import Any, Dict, List, Optional, Union
 
 from langchain.callbacks.manager import Callbacks
 from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
 from langchain.chains.llm import LLMChain
-from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE
+from langchain.chat_models.azure_openai import AzureChatOpenAI
+from langchain.chat_models.openai import ChatOpenAI
+from langchain.evaluation.comparison.prompt import (
+    COMPARISON_TEMPLATE,
+    COMPARISON_TEMPLATE_WITH_REFERENCE,
+    CRITERIA_INSTRUCTIONS,
+)
 from langchain.evaluation.criteria.eval_chain import (
     CRITERIA_TYPE,
     Criteria,
@@ -17,6 +25,10 @@ from langchain.pydantic_v1 import Extra, Field
 from langchain.schema import RUN_KEY, BaseOutputParser
 from langchain.schema.language_model import BaseLanguageModel
 
+logger = logging.getLogger(__name__)
+
+_FIND_DOUBLE_BRACKETS = re.compile(r"\[\[(.*?)\]\]")
+
 _SUPPORTED_CRITERIA = {
     Criteria.CONCISENESS: "Is the submission concise and to the point?",
     Criteria.RELEVANCE: "Is the submission referring to a real quote from the text?",
@@ -112,27 +124,26 @@ class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
             ValueError: If the verdict is invalid.
 
         """
-        parsed = text.strip().rsplit("\n", maxsplit=1)
-        if len(parsed) == 1:
-            reasoning = ""
-            verdict = parsed[0]
-        else:
-            reasoning, verdict = parsed
-        verdict = verdict.strip("[").strip("]")
-        if verdict not in {"A", "B", "C"}:
+        match = _FIND_DOUBLE_BRACKETS.search(text)
+
+        if match:
+            verdict = match.group(1)
+
+        if not match or verdict not in {"A", "B", "C"}:
             raise ValueError(
-                f"Invalid verdict: {verdict}. "
-                "Verdict must be one of 'A', 'B', or 'C'."
+                f"Invalid output: {text}. "
+                "Output must contain a double bracketed string\
+ with the verdict 'A', 'B', or 'C'."
             )
         # C means the models are tied. Return 'None' meaning no preference
         verdict_ = None if verdict == "C" else verdict
         score = {
             "A": 1,
             "B": 0,
-            None: 0.5,
-        }.get(verdict_)
+            "C": 0.5,
+        }[verdict]
         return {
-            "reasoning": reasoning,
+            "reasoning": text,
             "value": verdict_,
             "score": score,
         }
@@ -225,7 +236,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
         """Initialize the PairwiseStringEvalChain from an LLM.
 
         Args:
-            llm (BaseLanguageModel): The LLM to use.
+            llm (BaseChatModel): The LLM to use (GPT-4 recommended).
             prompt (PromptTemplate, optional): The prompt to use.
             **kwargs (Any): Additional keyword arguments.
 
@@ -236,8 +247,17 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
             ValueError: If the input variables are not as expected.
 
         """
+        if not (
+            isinstance(llm, (ChatOpenAI, AzureChatOpenAI))
+            and llm.model_name.startswith("gpt-4")
+        ):
+            logger.warning(
+                "This chain was only tested with GPT-4. \
+Performance may be significantly worse with other models."
+            )
+
         expected_input_vars = {"prediction", "prediction_b", "input", "criteria"}
-        prompt_ = prompt or PROMPT
+        prompt_ = prompt or COMPARISON_TEMPLATE.partial(reference="")
         if expected_input_vars != set(prompt_.input_variables):
             raise ValueError(
                 f"Input variables should be {expected_input_vars}, "
@@ -245,6 +265,7 @@ class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
             )
         criteria_ = resolve_pairwise_criteria(criteria)
         criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
+        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
         return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
 
     def _prepare_input(
@@ -418,7 +439,7 @@ class LabeledPairwiseStringEvalChain(PairwiseStringEvalChain):
             "reference",
             "criteria",
         }
-        prompt_ = prompt or PROMPT_WITH_REFERENCE
+        prompt_ = prompt or COMPARISON_TEMPLATE_WITH_REFERENCE
         if expected_input_vars != set(prompt_.input_variables):
             raise ValueError(
                 f"Input variables should be {expected_input_vars}, "
@@ -426,4 +447,5 @@ class LabeledPairwiseStringEvalChain(PairwiseStringEvalChain):
             )
         criteria_ = resolve_pairwise_criteria(criteria)
         criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
+        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
         return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)
diff --git a/libs/langchain/langchain/evaluation/comparison/prompt.py b/libs/langchain/langchain/evaluation/comparison/prompt.py
index a29e8b4130..bed64a4dc6 100644
--- a/libs/langchain/langchain/evaluation/comparison/prompt.py
+++ b/libs/langchain/langchain/evaluation/comparison/prompt.py
@@ -5,64 +5,55 @@ and answers the question. The prompt is based on the paper from Zheng, et. al.
 https://arxiv.org/abs/2306.05685
 """
 # flake8: noqa
-from langchain.prompts import PromptTemplate
-
-template = """Act as a fair judge and rate the two responses to the question below.\
- Choose the response that best followed the instructions and answered the question.\
- Your assessment should weigh the following criteria:
-{criteria}\
- Start by comparing both responses and give a brief rationale.\
- Avoid bias from the order of presentation or response length.
-After giving your rationale, make your final decision using this format:\
- "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,\
- and "[[C]]" for a tie. Finally, repeat the decision again on its own on a new line.
-
-[QUESTION]
-{input}
-[/QUESTION]
-
-[RESPONSE A]
-{prediction}
-[/RESPONSE A]
-
-[RESPONSE B]
-{prediction_b}
-[/RESPONSE B]"""
-PROMPT = PromptTemplate(
-    input_variables=["input", "prediction", "prediction_b", "criteria"],
-    template=template,
+from langchain.prompts.chat import ChatPromptTemplate
+
+SYSTEM_MESSAGE = 'Please act as an impartial judge and evaluate the quality \
+of the responses provided by two AI assistants to the user question displayed below. \
+You should choose the assistant that follows the user\'s instructions \
+and answers \the user\'s question better. \
+Your evaluation should consider factors such as the \
+helpfulness, relevance, accuracy, depth, creativity, \
+and level of detail of their responses. \
+Begin your evaluation by comparing the two responses and provide a short explanation. \
+Avoid any position biases and ensure that the order in which \
+the responses were presented does not influence your decision. \
+Do not allow the length of the responses to influence your evaluation. \
+Do not favor certain names of the assistants. Be as objective as possible. \
+After providing your explanation, output your final verdict by strictly following \
+this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better, \
+and "[[C]]" for a tie.'
+
+CRITERIA_INSTRUCTIONS = (
+    "For this evaluation, you should primarily consider the following criteria:\n"
 )
 
-template = """Act as a fair judge and rate the two responses to the question below.\
- Choose the response that best followed the instructions and answered the question.\
- Your assessment should weigh the following criteria:
-{criteria}\
- Start by comparing both responses and give a brief rationale.\
- Avoid bias from the order of presentation or response length.\
- Weigh accuracy based on the following ground truth reference\
- answer to the question:
-
-[REFERENCE]
-{reference}
-[/REFERENCE]
-
-After giving your rationale, make your final decision using this format:\
- "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,\
- and "[[C]]" for a tie. Finally, repeat the decision again on its own on a new line.
-
-[QUESTION]
-{input}
-[/QUESTION]
-
-[RESPONSE A]
-{prediction}
-[/RESPONSE A]
-
-[RESPONSE B]
-{prediction_b}
-[/RESPONSE B]"""
+COMPARISON_TEMPLATE = ChatPromptTemplate.from_messages(
+    [
+        ("system", SYSTEM_MESSAGE),
+        (
+            "human",
+            "{criteria}[User Question]\n{input}\n\n\
+[The Start of Assistant A's Answer]\n{prediction}\n\
+[The End of Assistant A's Answer]\
+\n\n[The Start of Assistant B's Answer]\n{prediction_b}\n\
+[The End of Assistant B's Answer]",
+        ),
+    ]
+)
 
-PROMPT_WITH_REFERENCE = PromptTemplate(
-    input_variables=["input", "prediction", "prediction_b", "reference", "criteria"],
-    template=template,
+COMPARISON_TEMPLATE_WITH_REFERENCE = ChatPromptTemplate.from_messages(
+    [
+        ("system", SYSTEM_MESSAGE),
+        (
+            "human",
+            "{criteria}\n\nTo help you evaluate the responses, \
+here is a reference answer to the user's question:\n\
+{reference}\
+[User Question]\n{input}\n\n\
+[The Start of Assistant A's Answer]\n{prediction}\n\
+[The End of Assistant A's Answer]\
+\n\n[The Start of Assistant B's Answer]\n{prediction_b}\n\
+[The End of Assistant B's Answer]",
+        ),
+    ]
 )
diff --git a/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py b/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py
index 269d3caf2b..0bbfb43b23 100644
--- a/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py
+++ b/libs/langchain/tests/unit_tests/evaluation/comparison/test_eval_chain.py
@@ -34,7 +34,7 @@ def test_PairwiseStringResultOutputParser_parse() -> None:
 [[A]]"""
     got = output_parser.parse(text)
     want = {
-        "reasoning": "I like pie better than cake.",
+        "reasoning": text,
         "value": "A",
         "score": 1,
     }
@@ -46,7 +46,7 @@
 [[B]]"""
     got = output_parser.parse(text)
     want = {
-        "reasoning": "I like cake better than pie.",
+        "reasoning": text,
         "value": "B",
         "score": 0,
     }
@@ -58,7 +58,7 @@
 [[C]]"""
     got = output_parser.parse(text)
     want = {
-        "reasoning": "I like cake and pie.",
+        "reasoning": text,
         "value": None,
         "score": 0.5,
     }
@@ -84,7 +84,7 @@ def test_pairwise_string_comparison_chain() -> None:
     )
     assert res["value"] is None
     assert res["score"] == 0.5
-    assert res["reasoning"] == "The values are the same."
+ assert res["reasoning"] == "The values are the same.\n[[C]]" res = chain.evaluate_string_pairs( prediction="I like pie.", prediction_b="I like pie.",