mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Add a Pairwise Comparison Chain (#6703)
Notebook shows preference scoring between two chains and reports wilson score interval + p value I think I'll add the option to insert ground truth labels but doesn't have to be in this PR
This commit is contained in:
parent
2928b080f6
commit
cc60fed3be
447
docs/extras/guides/evaluation/comparisons.ipynb
Normal file
447
docs/extras/guides/evaluation/comparisons.ipynb
Normal file
@ -0,0 +1,447 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Comparing Chain Outputs\n",
|
||||||
|
"\n",
|
||||||
|
"Suppose you have two different prompts (or LLMs). How do you know which will generate \"better\" results?\n",
|
||||||
|
"\n",
|
||||||
|
"One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
|
||||||
|
"\n",
|
||||||
|
"For this evalution, we will need 3 things:\n",
|
||||||
|
"1. An evaluator\n",
|
||||||
|
"2. A dataset of inputs\n",
|
||||||
|
"3. 2 (or more) LLMs, Chains, or Agents to compare\n",
|
||||||
|
"\n",
|
||||||
|
"Then we will aggregate the restults to determine the preferred model.\n",
|
||||||
|
"\n",
|
||||||
|
"### Step 1. Create the Evaluator\n",
|
||||||
|
"\n",
|
||||||
|
"In this example, you will use gpt-4 to select which output is preferred."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Optional if you are tracing the notebook\n",
|
||||||
|
"%env LANGCHAIN_PROJECT=\"Comparing Chain Outputs\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"from langchain.evaluation.comparison import PairwiseStringEvalChain\n",
|
||||||
|
"\n",
|
||||||
|
"llm = ChatOpenAI(model=\"gpt-4\")\n",
|
||||||
|
"\n",
|
||||||
|
"eval_chain = PairwiseStringEvalChain.from_llm(llm=llm)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Step 2. Select Dataset\n",
|
||||||
|
"\n",
|
||||||
|
"If you already have real usage data for your LLM, you can use a representative sample. More examples\n",
|
||||||
|
"provide more reliable results. We will use some example queries someone might have about how to use langchain here."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Found cached dataset parquet (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___parquet/LangChainDatasets--langchain-howto-queries-bbb748bbee7e77aa/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "d852a1884480457292c90d8bd9d4f1e6",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain.evaluation.loading import load_dataset\n",
|
||||||
|
"\n",
|
||||||
|
"dataset = load_dataset(\"langchain-howto-queries\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Step 3. Define Models to Compare\n",
|
||||||
|
"\n",
|
||||||
|
"We will be comparing two agents in this case."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain import SerpAPIWrapper\n",
|
||||||
|
"from langchain.agents import initialize_agent, Tool\n",
|
||||||
|
"from langchain.agents import AgentType\n",
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Initialize the language model\n",
|
||||||
|
"# You can add your own OpenAI API key by adding openai_api_key=\"<your_api_key>\" \n",
|
||||||
|
"llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Initialize the SerpAPIWrapper for search functionality\n",
|
||||||
|
"#Replace <your_api_key> in openai_api_key=\"<your_api_key>\" with your actual SerpAPI key.\n",
|
||||||
|
"search = SerpAPIWrapper()\n",
|
||||||
|
"\n",
|
||||||
|
"# Define a list of tools offered by the agent\n",
|
||||||
|
"tools = [\n",
|
||||||
|
" Tool(\n",
|
||||||
|
" name=\"Search\",\n",
|
||||||
|
" func=search.run,\n",
|
||||||
|
" coroutine=search.arun,\n",
|
||||||
|
" description=\"Useful when you need to answer questions about current events. You should ask targeted questions.\"\n",
|
||||||
|
" ),\n",
|
||||||
|
"]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"functions_agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False)\n",
|
||||||
|
"conversations_agent = initialize_agent(tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"list(zip(*[iter(batch_results)]*2)### Step 4. Generate Responses\n",
|
||||||
|
"\n",
|
||||||
|
"We will generate outputs for each of the models before evaluating them."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "b076d6bf6680422aa9082d4bad4d98a3",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
" 0%| | 0/20 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Retrying langchain.chat_models.openai.acompletion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..\n",
|
||||||
|
"Retrying langchain.chat_models.openai.acompletion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from tqdm.notebook import tqdm\n",
|
||||||
|
"import asyncio\n",
|
||||||
|
"\n",
|
||||||
|
"results = []\n",
|
||||||
|
"agents = [functions_agent, conversations_agent]\n",
|
||||||
|
"concurrency_level = 6 # How many concurrent agents to run. May need to decrease if OpenAI is rate limiting.\n",
|
||||||
|
"\n",
|
||||||
|
"# We will only run the first 20 examples of this dataset to speed things up\n",
|
||||||
|
"# This will lead to larger confidence intervals downstream.\n",
|
||||||
|
"batch = []\n",
|
||||||
|
"for example in tqdm(dataset[:20]):\n",
|
||||||
|
" batch.extend([agent.acall(example['inputs']) for agent in agents])\n",
|
||||||
|
" if len(batch) >= concurrency_level:\n",
|
||||||
|
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
|
||||||
|
" results.extend(list(zip(*[iter(batch_results)]*2)))\n",
|
||||||
|
" batch = []\n",
|
||||||
|
"if batch:\n",
|
||||||
|
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
|
||||||
|
" results.extend(list(zip(*[iter(batch_results)]*2)))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Step 5. Evaluate Pairs\n",
|
||||||
|
"\n",
|
||||||
|
"Now it's time to evaluate the results. For each agent response, run the evaluation chain to select which output is preferred (or return a tie).\n",
|
||||||
|
"\n",
|
||||||
|
"Randomly select the input order to reduce the likelihood that one model will be preferred just because it is presented first."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import random\n",
|
||||||
|
"\n",
|
||||||
|
"def predict_preferences(dataset, results) -> list:\n",
|
||||||
|
" preferences = []\n",
|
||||||
|
"\n",
|
||||||
|
" for example, (res_a, res_b) in zip(dataset, results):\n",
|
||||||
|
" input_ = example['inputs']\n",
|
||||||
|
" # Flip a coin to reduce persistent position bias\n",
|
||||||
|
" if random.random() < 0.5:\n",
|
||||||
|
" pred_a, pred_b = res_a, res_b\n",
|
||||||
|
" a, b = \"a\", \"b\"\n",
|
||||||
|
" else:\n",
|
||||||
|
" pred_a, pred_b = res_b, res_a\n",
|
||||||
|
" a, b = \"b\", \"a\"\n",
|
||||||
|
" eval_res = eval_chain.evaluate_string_pairs(\n",
|
||||||
|
" output_a=pred_a['output'] if isinstance(pred_a, dict) else str(pred_a),\n",
|
||||||
|
" output_b=pred_b['output'] if isinstance(pred_b, dict) else str(pred_b),\n",
|
||||||
|
" input=input_\n",
|
||||||
|
" )\n",
|
||||||
|
" if eval_res[\"value\"] == \"A\":\n",
|
||||||
|
" preferences.append(a)\n",
|
||||||
|
" elif eval_res[\"value\"] == \"B\":\n",
|
||||||
|
" preferences.append(b)\n",
|
||||||
|
" else:\n",
|
||||||
|
" preferences.append(None) # No preference\n",
|
||||||
|
" return preferences"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"preferences = predict_preferences(dataset, results)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"**Print out the ratio of preferences.**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"OpenAI Functions Agent: 90.00%\n",
|
||||||
|
"Structured Chat Agent: 10.00%\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from collections import Counter\n",
|
||||||
|
"\n",
|
||||||
|
"name_map = {\n",
|
||||||
|
" \"a\": \"OpenAI Functions Agent\",\n",
|
||||||
|
" \"b\": \"Structured Chat Agent\",\n",
|
||||||
|
"}\n",
|
||||||
|
"counts = Counter(preferences)\n",
|
||||||
|
"pref_ratios = {\n",
|
||||||
|
" k: v/len(preferences) for k, v in\n",
|
||||||
|
" counts.items()\n",
|
||||||
|
"}\n",
|
||||||
|
"for k, v in pref_ratios.items():\n",
|
||||||
|
" print(f\"{name_map.get(k)}: {v:.2%}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Estimate Confidence Intervals\n",
|
||||||
|
"\n",
|
||||||
|
"The results seem pretty clear, but if you want to have a better sense of how confident we are, that model \"A\" (the OpenAI Functions Agent) is the preferred model, we can calculate confidence intervals. \n",
|
||||||
|
"\n",
|
||||||
|
"Below, use the Wilson score to estimate the confidence interval."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from math import sqrt\n",
|
||||||
|
"\n",
|
||||||
|
"def wilson_score_interval(preferences: list, which: str = \"a\", z: float = 1.96) -> tuple:\n",
|
||||||
|
" \"\"\"Estimate the confidence interval using the Wilson score.\n",
|
||||||
|
" \n",
|
||||||
|
" See: https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval\n",
|
||||||
|
" for more details, including when to use it and when it should not be used.\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" total_preferences = preferences.count('a') + preferences.count('b')\n",
|
||||||
|
" n_s = preferences.count(which)\n",
|
||||||
|
"\n",
|
||||||
|
" if total_preferences == 0:\n",
|
||||||
|
" return (0, 0)\n",
|
||||||
|
"\n",
|
||||||
|
" p_hat = n_s / total_preferences\n",
|
||||||
|
"\n",
|
||||||
|
" denominator = 1 + (z**2) / total_preferences\n",
|
||||||
|
" adjustment = (z / denominator) * sqrt(p_hat*(1-p_hat)/total_preferences + (z**2)/(4*total_preferences*total_preferences))\n",
|
||||||
|
" center = (p_hat + (z**2) / (2*total_preferences)) / denominator\n",
|
||||||
|
" lower_bound = min(max(center - adjustment, 0.0), 1.0)\n",
|
||||||
|
" upper_bound = min(max(center + adjustment, 0.0), 1.0)\n",
|
||||||
|
"\n",
|
||||||
|
" return (lower_bound, upper_bound)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The \"OpenAI Functions Agent\" would be preferred between 69.90% and 97.21% percent of the time (with 95% confidence).\n",
|
||||||
|
"The \"Structured Chat Agent\" would be preferred between 2.79% and 30.10% percent of the time (with 95% confidence).\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for which_, name in name_map.items():\n",
|
||||||
|
" low, high = wilson_score_interval(preferences, which=which_)\n",
|
||||||
|
" print(f'The \"{name}\" would be preferred between {low:.2%} and {high:.2%} percent of the time (with 95% confidence).')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**Print out the p-value.**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The p-value is 0.00040. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
|
||||||
|
"then there is a 0.04025% chance of observing the OpenAI Functions Agent be preferred at least 18\n",
|
||||||
|
"times out of 20 trials.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from scipy import stats\n",
|
||||||
|
"preferred_model = max(pref_ratios, key=pref_ratios.get)\n",
|
||||||
|
"successes = preferences.count(preferred_model)\n",
|
||||||
|
"n = len(preferences) - preferences.count(None)\n",
|
||||||
|
"p_value = stats.binom_test(successes, n, p=0.5, alternative='two-sided')\n",
|
||||||
|
"print(f\"\"\"The p-value is {p_value:.5f}. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
|
||||||
|
"then there is a {p_value:.5%} chance of observing the {name_map.get(preferred_model)} be preferred at least {successes}\n",
|
||||||
|
"times out of {n} trials.\"\"\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<a name=\"cite_note-1\"></a>_1. Note: Automated evals are still an open research topic and are best used alongside other evaluation approaches. \n",
|
||||||
|
"LLM preferences exhibit biases, including banal ones like the order of outputs.\n",
|
||||||
|
"In choosing preferences, \"ground truth\" may not be taken into account, which may lead to scores that aren't grounded in utility._"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
@ -1 +1,35 @@
|
|||||||
"""[BETA] Functionality relating to evaluation."""
|
"""Functionality relating to evaluation.
|
||||||
|
|
||||||
|
This module contains off-the-shelf evaluation chains for
|
||||||
|
grading the output of LangChain primitives such as LLMs and Chains.
|
||||||
|
|
||||||
|
Some common use cases for evaluation include:
|
||||||
|
|
||||||
|
- Grading accuracy of a response against ground truth answers: QAEvalChain
|
||||||
|
- Comparing the output of two models: PairwiseStringEvalChain
|
||||||
|
- Judging the efficacy of an agent's tool usage: TrajectoryEvalChain
|
||||||
|
- Checking whether an output complies with a set of criteria: CriteriaEvalChain
|
||||||
|
|
||||||
|
This module also contains low level APIs for making more evaluators for your
|
||||||
|
custom evaluation task. These include:
|
||||||
|
- StringEvaluator: Evaluates an output string against a reference and/or
|
||||||
|
with input context.
|
||||||
|
- PairwiseStringEvaluator: Evaluates two strings against each other.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
|
||||||
|
from langchain.evaluation.comparison import PairwiseStringEvalChain
|
||||||
|
from langchain.evaluation.criteria.eval_chain import CriteriaEvalChain
|
||||||
|
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
|
||||||
|
from langchain.evaluation.schema import PairwiseStringEvaluator, StringEvaluator
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"PairwiseStringEvalChain",
|
||||||
|
"QAEvalChain",
|
||||||
|
"CotQAEvalChain",
|
||||||
|
"ContextQAEvalChain",
|
||||||
|
"StringEvaluator",
|
||||||
|
"PairwiseStringEvaluator",
|
||||||
|
"TrajectoryEvalChain",
|
||||||
|
"CriteriaEvalChain",
|
||||||
|
]
|
||||||
|
34
langchain/evaluation/comparison/__init__.py
Normal file
34
langchain/evaluation/comparison/__init__.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
"""Comparison evaluators.
|
||||||
|
|
||||||
|
This module contains evaluators for comparing the output of two models,
|
||||||
|
be they LLMs, Chains, or otherwise. This can be used for scoring
|
||||||
|
preferences, measuring similarity / semantic equivalence between outputs,
|
||||||
|
or any other comparison task.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> from langchain.chat_models import ChatOpenAI
|
||||||
|
>>> from langchain.evaluation.comparison import PairwiseStringEvalChain
|
||||||
|
>>> llm = ChatOpenAI(temperature=0)
|
||||||
|
>>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
|
||||||
|
>>> result = chain.evaluate_string_pairs(
|
||||||
|
... input = "What is the chemical formula for water?",
|
||||||
|
... output_a = "H2O",
|
||||||
|
... output_b = (
|
||||||
|
... "The chemical formula for water is H2O, which means"
|
||||||
|
... " there are two hydrogen atoms and one oxygen atom."
|
||||||
|
... ),
... reference = "The chemical formula for water is H2O.",
|
||||||
|
... )
|
||||||
|
>>> print(result)
|
||||||
|
# {
|
||||||
|
# "value": "B",
|
||||||
|
# "comment": "Both responses accurately state"
|
||||||
|
# " that the chemical formula for water is H2O."
|
||||||
|
# " However, Response B provides additional information"
|
||||||
|
# . " by explaining what the formula means.\n[[B]]"
|
||||||
|
# }
|
||||||
|
"""
|
||||||
|
from langchain.evaluation.comparison.eval_chain import (
|
||||||
|
PairwiseStringEvalChain,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = ["PairwiseStringEvalChain"]
|
205
langchain/evaluation/comparison/eval_chain.py
Normal file
205
langchain/evaluation/comparison/eval_chain.py
Normal file
@ -0,0 +1,205 @@
|
|||||||
|
"""Base classes for comparing the output of two models."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
|
||||||
|
from langchain.base_language import BaseLanguageModel
|
||||||
|
from langchain.callbacks.manager import Callbacks
|
||||||
|
from langchain.chains.llm import LLMChain
|
||||||
|
from langchain.evaluation.comparison.prompt import PROMPT, PROMPT_WITH_REFERENCE
|
||||||
|
from langchain.prompts.prompt import PromptTemplate
|
||||||
|
from langchain.schema import BaseOutputParser
|
||||||
|
|
||||||
|
|
||||||
|
class PairwiseStringResultOutputParser(BaseOutputParser[dict]):
    """A parser for the output of the PairwiseStringEvalChain.

    The last line of the response is read as the verdict ("[[A]]", "[[B]]"
    or "[[C]]"); everything before it is kept as the reasoning.
    """

    @property
    def _type(self) -> str:
        """Return the string naming this parser type."""
        return "pairwise_string_result"

    def parse(self, text: str) -> Any:
        """Parse the output text.

        Args:
            text (str): The output text to parse.

        Returns:
            Any: A dictionary with the keys:
                - reasoning: The text preceding the final verdict line
                    (empty when the response is the verdict line only).
                - value: 'A', 'B', or None when the verdict is a tie ('C').
                - score: 1 for 'A', 0 for 'B', and 0.5 for a tie.

        Raises:
            ValueError: If the final line is not one of 'A', 'B' or 'C'
                once surrounding whitespace and square brackets are removed.
        """
        # rpartition always yields three parts, so a response consisting of
        # the verdict line alone gives an empty reasoning instead of failing
        # with an unpacking error.
        reasoning, _, verdict = text.strip().rpartition("\n")
        verdict = verdict.strip().strip("[").strip("]")
        if verdict not in {"A", "B", "C"}:
            raise ValueError(
                f"Invalid verdict: {verdict}. "
                "Verdict must be one of 'A', 'B', or 'C'."
            )
        # C means the models are tied. Return 'None' meaning no preference
        verdict_ = None if verdict == "C" else verdict
        score = {
            "A": 1,
            "B": 0,
            None: 0.5,
        }.get(verdict_)
        return {
            "reasoning": reasoning,
            "value": verdict_,
            "score": score,
        }
|
||||||
|
|
||||||
|
|
||||||
|
class PairwiseStringEvalChain(LLMChain):
    """A chain for comparing the output of two models.

    Example:
        >>> from langchain.chat_models import ChatOpenAI
        >>> from langchain.evaluation.comparison import PairwiseStringEvalChain
        >>> llm = ChatOpenAI(temperature=0)
        >>> chain = PairwiseStringEvalChain.from_llm(
        ...     llm=llm, require_reference=True
        ... )
        >>> result = chain.evaluate_string_pairs(
        ...     input="What is the chemical formula for water?",
        ...     output_a="H2O",
        ...     output_b=(
        ...         "The chemical formula for water is H2O, which means"
        ...         " there are two hydrogen atoms and one oxygen atom."
        ...     ),
        ...     reference="The chemical formula for water is H2O.",
        ... )
        >>> print(result)
        # {
        #    "reasoning": "Both responses accurately state"
        #       " that the chemical formula for water is H2O."
        #       " However, Response B provides additional information"
        #       " by explaining what the formula means.\n[[B]]",
        #    "value": "B",
        #    "score": 0,
        # }
    """

    # Parser applied to the raw LLM text; yields the reasoning/value/score dict.
    output_parser: BaseOutputParser = Field(
        default_factory=PairwiseStringResultOutputParser
    )

    @classmethod
    def from_llm(
        cls,
        *,
        llm: BaseLanguageModel,
        prompt: Optional[PromptTemplate] = None,
        require_reference: bool = False,
        **kwargs: Any,
    ) -> PairwiseStringEvalChain:
        """Initialize the PairwiseStringEvalChain from an LLM.

        Args:
            llm (BaseLanguageModel): The LLM to use.
            prompt (PromptTemplate, optional): The prompt to use. When omitted,
                PROMPT_WITH_REFERENCE is used if ``require_reference`` is set
                and PROMPT otherwise.
            require_reference (bool, optional): Whether to require a reference
                string. Defaults to False.
            **kwargs (Any): Additional keyword arguments, forwarded to the
                chain constructor.

        Returns:
            PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.

        Raises:
            ValueError: If the prompt's input variables are not exactly
                "input", "output_a" and "output_b" (plus "reference" when
                ``require_reference`` is set).
        """
        expected_input_vars = {"output_a", "output_b", "input"}
        # The reference variable is expected for default and custom prompts
        # alike, so it is added once up front.
        if require_reference:
            expected_input_vars.add("reference")

        if prompt is None:
            prompt_ = PROMPT_WITH_REFERENCE if require_reference else PROMPT
        else:
            prompt_ = prompt

        if expected_input_vars != set(prompt_.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt_.input_variables}"
            )
        return cls(llm=llm, prompt=prompt_, **kwargs)

    def _prepare_input(
        self, output_a: str, output_b: str, input: str, reference: Optional[str]
    ) -> dict:
        """Build the prompt inputs for one comparison.

        The reference is included only when it is not None and the prompt
        declares a "reference" input variable; otherwise it is dropped.
        """
        input_ = {
            "output_a": output_a,
            "output_b": output_b,
            "input": input,
        }
        if reference is not None and "reference" in self.prompt.input_variables:
            input_["reference"] = reference
        return input_

    def evaluate_string_pairs(
        self,
        *,
        output_a: str,
        output_b: str,
        input: str,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate whether output A is preferred to output B.

        Args:
            output_a (str): The output string from the first model.
            output_b (str): The output string from the second model.
            input (str): The input or task string.
            reference (str, optional): The reference string, if any.
            callbacks (Callbacks, optional): The callbacks to use.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - reasoning: The reasoning for the preference.
                - value: The preference value, which is either 'A', 'B', or None
                    for no preference.
                - score: The preference score, which is 1 for 'A', 0 for 'B',
                    and 0.5 for None.
        """
        input_ = self._prepare_input(output_a, output_b, input, reference)
        result = self(
            inputs=input_,
            callbacks=callbacks,
            **kwargs,
        )
        # The "text" entry of the chain result is what gets returned.
        return result["text"]

    async def aevaluate_string_pairs(
        self,
        *,
        output_a: str,
        output_b: str,
        input: str,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate whether output A is preferred to output B.

        Args:
            output_a (str): The output string from the first model.
            output_b (str): The output string from the second model.
            input (str): The input or task string.
            reference (str, optional): The reference string, if any.
            callbacks (Callbacks, optional): The callbacks to use.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - reasoning: The reasoning for the preference.
                - value: The preference value, which is either 'A', 'B', or None
                    for no preference.
                - score: The preference score, which is 1 for 'A', 0 for 'B',
                    and 0.5 for None.
        """
        input_ = self._prepare_input(output_a, output_b, input, reference)
        result = await self.acall(
            inputs=input_,
            callbacks=callbacks,
            **kwargs,
        )
        # The "text" entry of the chain result is what gets returned.
        return result["text"]
|
64
langchain/evaluation/comparison/prompt.py
Normal file
64
langchain/evaluation/comparison/prompt.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
"""Prompts for comparing the outputs of two models for a given question.
|
||||||
|
|
||||||
|
This prompt is used to compare two responses and evaluate which one best follows the instructions
|
||||||
|
and answers the question. The prompt is based on the paper from
|
||||||
|
Zheng et al.: https://arxiv.org/abs/2306.05685
|
||||||
|
"""
|
||||||
|
# flake8: noqa
|
||||||
|
from langchain.prompts import PromptTemplate
|
||||||
|
|
||||||
|
# A trailing backslash joins the next physical line onto the current one with
# no separator, so every continuation line starts with a space to keep the
# sentences of the instructions from running together.
template = """Act as a fair judge and rate the two responses to the question below.\
 Choose the response that best followed the instructions and answered the question.\
 Your assessment should weigh helpfulness, relevance, accuracy, depth, creativity, and detail.\
 Start by comparing both responses and give a brief rationale.\
 Avoid bias from the order of presentation or response length.
After giving your rationale, make your final decision using this format:\
 "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,\
 and "[[C]]" for a tie. Finally, repeat the decision again on its own on a new line.

[QUESTION]
{input}
[/QUESTION]

[RESPONSE A]
{output_a}
[/RESPONSE A]

[RESPONSE B]
{output_b}
[/RESPONSE B]"""
# Default comparison prompt (no ground-truth reference).
PROMPT = PromptTemplate(
    input_variables=["input", "output_a", "output_b"], template=template
)

# Same instructions, plus a ground-truth reference block to weigh accuracy against.
template = """Act as a fair judge and rate the two responses to the question below.\
 Choose the response that best followed the instructions and answered the question.\
 Your assessment should weigh helpfulness, relevance, accuracy, depth, creativity, and detail.\
 Start by comparing both responses and give a brief rationale.\
 Avoid bias from the order of presentation or response length.\
 Weigh accuracy based on the following ground truth reference\
 answer to the question:

[REFERENCE]
{reference}
[/REFERENCE]

After giving your rationale, make your final decision using this format:\
 "[[A]]" if assistant A is better, "[[B]]" if assistant B is better,\
 and "[[C]]" for a tie. Finally, repeat the decision again on its own on a new line.

[QUESTION]
{input}
[/QUESTION]

[RESPONSE A]
{output_a}
[/RESPONSE A]

[RESPONSE B]
{output_b}
[/RESPONSE B]"""

PROMPT_WITH_REFERENCE = PromptTemplate(
    input_variables=["input", "output_a", "output_b", "reference"], template=template
)
|
@ -14,7 +14,7 @@ class StringEvaluator(Protocol):
|
|||||||
prediction: str,
|
prediction: str,
|
||||||
reference: Optional[str] = None,
|
reference: Optional[str] = None,
|
||||||
input: Optional[str] = None,
|
input: Optional[str] = None,
|
||||||
**kwargs: Any
|
**kwargs: Any,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Evaluate Chain or LLM output, based on optional input and label.
|
"""Evaluate Chain or LLM output, based on optional input and label.
|
||||||
|
|
||||||
@ -34,7 +34,7 @@ class StringEvaluator(Protocol):
|
|||||||
prediction: str,
|
prediction: str,
|
||||||
reference: Optional[str] = None,
|
reference: Optional[str] = None,
|
||||||
input: Optional[str] = None,
|
input: Optional[str] = None,
|
||||||
**kwargs: Any
|
**kwargs: Any,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Asynchronously evaluate Chain or LLM output, based on optional
|
"""Asynchronously evaluate Chain or LLM output, based on optional
|
||||||
input and label.
|
input and label.
|
||||||
@ -48,6 +48,66 @@ class StringEvaluator(Protocol):
|
|||||||
Returns:
|
Returns:
|
||||||
dict: The evaluation results containing the score or value.
|
dict: The evaluation results containing the score or value.
|
||||||
"""
|
"""
|
||||||
return self.evaluate_strings(
|
raise NotImplementedError(
|
||||||
prediction=prediction, reference=reference, input=input, **kwargs
|
f"{self.__class__.__name__} hasn't implemented an "
|
||||||
|
"async aevaluate_strings method."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@runtime_checkable
class PairwiseStringEvaluator(Protocol):
    """Structural interface for evaluators that compare two model outputs."""

    @abstractmethod
    def evaluate_string_pairs(
        self,
        *,
        output_a: str,
        output_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Compare a pair of output strings.

        All arguments are keyword-only.

        Args:
            output_a (str): Output string produced by the first model.
            output_b (str): Output string produced by the second model.
            reference (str, optional): Expected output / reference string.
                Defaults to None.
            input (str, optional): Input string the outputs respond to.
                Defaults to None.
            **kwargs (Any): Extra keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            dict: The preference, scores, and/or other information.
        """

    async def aevaluate_string_pairs(
        self,
        output_a: str,
        output_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously compare a pair of output strings.

        This default implementation always raises; implementers override it
        to provide async support.

        Args:
            output_a (str): Output string produced by the first model.
            output_b (str): Output string produced by the second model.
            reference (str, optional): Expected output / reference string.
                Defaults to None.
            input (str, optional): Input string the outputs respond to.
                Defaults to None.
            **kwargs (Any): Extra keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            dict: The preference, scores, and/or other information.

        Raises:
            NotImplementedError: Always, unless overridden.
        """
        message = (
            f"{self.__class__.__name__} hasn't implemented an async "
            "aevaluate_string_pairs method."
        )
        raise NotImplementedError(message)
|
||||||
|
0
tests/unit_tests/evaluation/comparison/__init__.py
Normal file
0
tests/unit_tests/evaluation/comparison/__init__.py
Normal file
39
tests/unit_tests/evaluation/comparison/test_eval_chain.py
Normal file
39
tests/unit_tests/evaluation/comparison/test_eval_chain.py
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
"""Test the comparison chains."""
|
||||||
|
|
||||||
|
|
||||||
|
from langchain.evaluation.comparison.eval_chain import PairwiseStringEvalChain
|
||||||
|
from tests.unit_tests.llms.fake_llm import FakeLLM
|
||||||
|
|
||||||
|
|
||||||
|
def test_pairwise_string_comparison_chain() -> None:
    """Check that the chain turns each judge verdict into value and score.

    The fake LLM is set up with three canned replies; the assertions expect
    the three evaluations to see the "[[C]]", "[[A]]" and "[[B]]" verdicts in
    that order.
    """
    canned_replies = {
        "a": "The values are the same.\n[[C]]",
        "b": "A is clearly better than b.\n[[A]]",
        "c": "B is clearly better than a.\n[[B]]",
    }
    fake_llm = FakeLLM(queries=canned_replies, sequential_responses=True)
    chain = PairwiseStringEvalChain.from_llm(llm=fake_llm)
    question = "What is your favorite food?"

    # Tie verdict: no preferred value, midpoint score, rationale preserved.
    tie = chain.evaluate_string_pairs(
        output_a="I like pie.",
        output_b="I love pie.",
        input=question,
    )
    assert tie["value"] is None
    assert tie["score"] == 0.5
    assert tie["reasoning"] == "The values are the same."

    # Verdict for A.
    a_wins = chain.evaluate_string_pairs(
        output_a="I like pie.",
        output_b="I like pie.",
        input=question,
    )
    assert a_wins["value"] == "A"
    assert a_wins["score"] == 1

    # Verdict for B.
    b_wins = chain.evaluate_string_pairs(
        output_a="I like pie.",
        output_b="I hate pie.",
        input=question,
    )
    assert b_wins["value"] == "B"
    assert b_wins["score"] == 0
|
Loading…
Reference in New Issue
Block a user