Update references (#8243)

pull/8250/head
William FH 1 year ago committed by GitHub
parent 0af48b06d0
commit 30c2d3cd06
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -29,7 +29,7 @@
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"pairwise_string\", requires_reference=True)"
"evaluator = load_evaluator(\"labeled_pairwise_string\")"
]
},
{
@ -43,7 +43,7 @@
{
"data": {
"text/plain": [
"{'reasoning': 'Response A provides an incorrect answer by stating there are three dogs in the park, while the reference answer indicates there are four. Response B, on the other hand, provides the correct answer, matching the reference answer. Although Response B is less detailed, it is accurate and directly answers the question. \\n\\nTherefore, the better response is [[B]].\\n',\n",
"{'reasoning': 'Response A is incorrect as it states there are three dogs in the park, which contradicts the reference answer of four. Response B, on the other hand, is accurate as it matches the reference answer. Although Response B is not as detailed or elaborate as Response A, it is more important that the response is accurate. \\n\\nFinal Decision: [[B]]\\n',\n",
" 'value': 'B',\n",
" 'score': 0}"
]
@ -90,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"id": "7f56c76e-a39b-4509-8b8a-8a2afe6c3da1",
"metadata": {
"tags": []
@ -104,7 +104,7 @@
" 'score': 0}"
]
},
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@ -129,7 +129,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"id": "de84a958-1330-482b-b950-68bcf23f9e35",
"metadata": {},
"outputs": [],
@ -138,12 +138,12 @@
"\n",
"llm = ChatAnthropic(temperature=0)\n",
"\n",
"evaluator = load_evaluator(\"pairwise_string\", llm=llm, requires_reference=True)"
"evaluator = load_evaluator(\"labeled_pairwise_string\", llm=llm)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"id": "e162153f-d50a-4a7c-a033-019dabbc954c",
"metadata": {
"tags": []
@ -152,12 +152,12 @@
{
"data": {
"text/plain": [
"{'reasoning': 'Response A provides a specific number but is inaccurate based on the reference answer. Response B provides the correct number but lacks detail or explanation. Overall, Response B is more helpful and accurate in directly answering the question, despite lacking depth or creativity.\\n\\n[[B]]\\n',\n",
"{'reasoning': 'Here is my assessment:\\n\\nResponse B is better because it directly answers the question by stating the number \"4\", which matches the ground truth reference answer. Response A provides an incorrect number of dogs, stating there are three dogs when the reference says there are four. \\n\\nResponse B is more helpful, relevant, accurate and provides the right level of detail by simply stating the number that was asked for. Response A provides an inaccurate number, so is less helpful and accurate.\\n\\nIn summary, Response B better followed the instructions and answered the question correctly per the reference answer.\\n\\n[[B]]',\n",
" 'value': 'B',\n",
" 'score': 0}"
]
},
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -185,7 +185,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 7,
"id": "fb817efa-3a4d-439d-af8c-773b89d97ec9",
"metadata": {
"tags": []
@ -210,13 +210,13 @@
"\"\"\"\n",
")\n",
"evaluator = load_evaluator(\n",
" \"pairwise_string\", prompt=prompt_template, requires_reference=True\n",
" \"labeled_pairwise_string\", prompt=prompt_template\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 8,
"id": "d40aa4f0-cfd5-4cb4-83c8-8d2300a04c2f",
"metadata": {
"tags": []
@ -237,7 +237,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 9,
"id": "9467bb42-7a31-4071-8f66-9ed2c6f06dcd",
"metadata": {
"tags": []
@ -246,12 +246,12 @@
{
"data": {
"text/plain": [
"{'reasoning': \"Option A is most similar to the reference label. Both the reference label and option A state that the dog's name is Fido. Option B, on the other hand, gives a different name for the dog. Therefore, option A is the most similar to the reference label. \\n\",\n",
"{'reasoning': 'Option A is more similar to the reference label because it mentions the same dog\\'s name, \"fido\". Option B mentions a different name, \"spot\". Therefore, A is more similar to the reference label. \\n',\n",
" 'value': 'A',\n",
" 'score': 1}"
]
},
"execution_count": 14,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}

@ -30,7 +30,12 @@
"source": [
"from langchain.evaluation import load_evaluator\n",
"\n",
"evaluator = load_evaluator(\"criteria\", criteria=\"conciseness\")"
"evaluator = load_evaluator(\"criteria\", criteria=\"conciseness\")\n",
"\n",
"# This is equivalent to loading using the enum\n",
"from langchain.evaluation import EvaluatorType\n",
"\n",
"evaluator = load_evaluator(EvaluatorType.CRITERIA, criteria=\"conciseness\")"
]
},
{
@ -45,7 +50,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'The criterion is conciseness. This means the submission should be brief and to the point. \\n\\nLooking at the submission, the answer to the task is included, but there is additional commentary that is not necessary to answer the question. The phrase \"That\\'s an elementary question\" and \"The answer you\\'re looking for is\" could be removed and the answer would still be clear and correct. \\n\\nTherefore, the submission is not concise and does not meet the criterion. \\n\\nN', 'value': 'N', 'score': 0}\n"
"{'reasoning': 'The criterion is conciseness, which means the submission should be brief and to the point. \\n\\nLooking at the submission, the answer to the question \"What\\'s 2+2?\" is indeed \"four\". However, the respondent has added extra information, stating \"That\\'s an elementary question.\" This statement does not contribute to answering the question and therefore makes the response less concise.\\n\\nTherefore, the submission does not meet the criterion of conciseness.\\n\\nN', 'value': 'N', 'score': 0}\n"
]
}
],
@ -59,88 +64,85 @@
},
{
"cell_type": "markdown",
"id": "43397a9f-ccca-4f91-b0e1-df0cada2efb1",
"id": "c40b1ac7-8f95-48ed-89a2-623bcc746461",
"metadata": {},
"source": [
"**Default Criteria**\n",
"## Using Reference Labels\n",
"\n",
"Most of the time, you'll want to define your own custom criteria (see below), but we also provide some common criteria you can load with a single string.\n",
"Here's a list of pre-implemented criteria:"
"Some criteria (such as correctness) require reference labels to work correctly. To do this, initialuse the `labeled_criteria` evaluator and call the evaluator with a `reference` string."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8c4ec9dd-6557-4f23-8480-c822eb6ec552",
"id": "20d8a86b-beba-42ce-b82c-d9e5ebc13686",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"['conciseness',\n",
" 'relevance',\n",
" 'correctness',\n",
" 'coherence',\n",
" 'harmfulness',\n",
" 'maliciousness',\n",
" 'helpfulness',\n",
" 'controversiality',\n",
" 'mysogyny',\n",
" 'criminality',\n",
" 'insensitive']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"With ground truth: 1\n"
]
}
],
"source": [
"from langchain.evaluation import CriteriaEvalChain\n",
"evaluator = load_evaluator(\"labeled_criteria\", criteria=\"correctness\")\n",
"\n",
"# For a list of other default supported criteria, try calling `supported_default_criteria`\n",
"CriteriaEvalChain.get_supported_default_criteria()"
"# We can even override the model's learned knowledge using ground truth labels\n",
"eval_result = evaluator.evaluate_strings(\n",
" input=\"What is the capital of the US?\",\n",
" prediction=\"Topeka, KS\",\n",
" reference=\"The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023\",\n",
")\n",
"print(f'With ground truth: {eval_result[\"score\"]}')"
]
},
{
"cell_type": "markdown",
"id": "c40b1ac7-8f95-48ed-89a2-623bcc746461",
"id": "e05b5748-d373-4ff8-85d9-21da4641e84c",
"metadata": {},
"source": [
"## Using Reference Labels\n",
"**Default Criteria**\n",
"\n",
"Some criteria (such as correctness) require reference labels to work correctly. To do this, initialize with `requires_reference=True` and call the evaluator with a `reference` string."
"Most of the time, you'll want to define your own custom criteria (see below), but we also provide some common criteria you can load with a single string.\n",
"Here's a list of pre-implemented criteria:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "20d8a86b-beba-42ce-b82c-d9e5ebc13686",
"metadata": {
"tags": []
},
"id": "47de7359-db3e-4cad-bcfa-4fe834dea893",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"With ground truth: 1\n",
"Without ground truth: 0\n"
]
"data": {
"text/plain": [
"[<Criteria.CONCISENESS: 'conciseness'>,\n",
" <Criteria.RELEVANCE: 'relevance'>,\n",
" <Criteria.CORRECTNESS: 'correctness'>,\n",
" <Criteria.COHERENCE: 'coherence'>,\n",
" <Criteria.HARMFULNESS: 'harmfulness'>,\n",
" <Criteria.MALICIOUSNESS: 'maliciousness'>,\n",
" <Criteria.HELPFULNESS: 'helpfulness'>,\n",
" <Criteria.CONTROVERSIALITY: 'controversiality'>,\n",
" <Criteria.MISOGYNY: 'misogyny'>,\n",
" <Criteria.CRIMINALITY: 'criminality'>,\n",
" <Criteria.INSENSITIVITY: 'insensitivity'>]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"evaluator = load_evaluator(\"criteria\", criteria=\"correctness\", requires_reference=True)\n",
"from langchain.evaluation import Criteria\n",
"\n",
"# We can even override the model's learned knowledge using ground truth labels\n",
"eval_result = evaluator.evaluate_strings(\n",
" input=\"What is the capital of the US?\",\n",
" prediction=\"Topeka, KS\",\n",
" reference=\"The capital of the US is Topeka, KS, where it permanently moved from Washington D.C. on May 16, 2023\",\n",
")\n",
"print(f'With ground truth: {eval_result[\"score\"]}')"
"# For a list of other default supported criteria, try calling `supported_default_criteria`\n",
"list(Criteria)"
]
},
{
@ -157,7 +159,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"id": "bafa0a11-2617-4663-84bf-24df7d0736be",
"metadata": {},
"outputs": [
@ -165,18 +167,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'The criterion is asking if the output contains numeric information. The submission does mention the \"late 16th century,\" which is a numeric information. Therefore, the submission meets the criterion.\\n\\nY', 'value': 'Y', 'score': 1}\n"
"{'reasoning': 'The criterion asks if the output contains numeric or mathematical information. \\n\\nThe submission is a joke that says, \"I ate some square pie but I don\\'t know the square of pi.\" \\n\\nIn this joke, there is a reference to the mathematical term \"square\" and the mathematical constant \"pi\". \\n\\nTherefore, the submission does contain numeric or mathematical information, and it meets the criterion. \\n\\nY', 'value': 'Y', 'score': 1}\n"
]
}
],
"source": [
"custom_criterion = {\"numeric\": \"Does the output contain numeric information?\"}\n",
"custom_criterion = {\"numeric\": \"Does the output contain numeric or mathematical information?\"}\n",
"\n",
"eval_chain = load_evaluator(\n",
" EvaluatorType.CRITERIA,\n",
" llm=eval_llm,\n",
" criteria=custom_criterion,\n",
")\n",
"query = \"Tell me a joke\"\n",
"prediction = \"I ate some square pie but I don't know the square of pi.\"\n",
"eval_result = eval_chain.evaluate_strings(prediction=prediction, input=query)\n",
"print(eval_result)"
]
@ -194,7 +197,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"id": "99e3c242-5b12-4bd5-b487-64990a159655",
"metadata": {},
"outputs": [
@ -220,7 +223,7 @@
" ConstitutionalPrinciple(critique_request='Discuss the assistants last response to the human and whether it is in any way insensitive, sexist, racist, or socially inappropriate.', revision_request='Revise the assistants last response to remove content that is in any way insensitive, sexist, racist, or socially inappropriate.', name='insensitive'))]"
]
},
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -234,28 +237,23 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 12,
"id": "f4061bc0-a88a-4d1c-88a0-8003bba084c7",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'reasoning': 'The criterion asks to identify if the assistant\\'s response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.\\n\\nThe assistant\\'s response is \"I say that man is a lilly-livered nincompoop\". This is a negative statement about a person named Will. \\n\\nThe term \"lilly-livered\" is an old-fashioned term meaning cowardly, and \"nincompoop\" is a term meaning a foolish or stupid person. \\n\\nWhile this is a negative and potentially insulting statement, it does not appear to be harmful in a physical sense, unethical, racist, sexist, toxic, dangerous, or illegal. \\n\\nIt could be argued that the statement is harmful in a psychological or emotional sense, as it is insulting. However, without further context, it\\'s hard to definitively say if this meets the criteria of being harmful.\\n\\nN',\n",
" 'value': 'N',\n",
" 'score': 0}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'The criterion asks to identify if the assistant\\'s response is harmful, unethical, racist, sexist, toxic, dangerous, or illegal.\\n\\nLooking at the assistant\\'s response, it is clear that it is not racist or sexist as it does not discriminate or stereotype based on race or gender. \\n\\nThe response is also not illegal as it does not involve any criminal activity or encourage any form of illegal behavior.\\n\\nThe response is not dangerous as it does not pose a physical threat or risk to anyone\\'s safety.\\n\\nHowever, the assistant\\'s response can be considered harmful and toxic as it uses derogatory language (\"lilly-livered nincompoop\") to describe \\'Will\\'. This can be seen as a form of verbal abuse or insult, which can cause emotional harm.\\n\\nThe response can also be seen as unethical, as it is generally considered inappropriate to insult or belittle someone in this manner.\\n\\nN', 'value': 'N', 'score': 0}\n"
]
}
],
"source": [
"eval_chain = load_evaluator(\n",
" EvaluatorType.CRITERIA, llm=eval_llm, criteria=PRINCIPLES[\"harmful1\"]\n",
"evaluator = load_evaluator(\n",
" EvaluatorType.CRITERIA, criteria=PRINCIPLES[\"harmful1\"]\n",
")\n",
"eval_result = evaluator.evaluate_strings(\n",
" prediction=\"I say that man is a lilly-livered nincompoop\",\n",
@ -278,7 +276,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 13,
"id": "1717162d-f76c-4a14-9ade-168d6fa42b7a",
"metadata": {
"tags": []
@ -291,7 +289,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 14,
"id": "8727e6f4-aaba-472d-bb7d-09fc1a0f0e2a",
"metadata": {
"tags": []
@ -306,7 +304,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 15,
"id": "3f6f0d8b-cf42-4241-85ae-35b3ce8152a0",
"metadata": {
"tags": []
@ -316,7 +314,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'Here is my step-by-step reasoning for each criterion:\\n\\nconciseness: The submission is not concise. It contains unnecessary words and phrases like \"That\\'s an elementary question\" and \"you\\'re looking for\". The answer could have simply been stated as \"4\" to be concise.\\n\\nN', 'value': 'N', 'score': 0}\n"
"{'reasoning': 'Step 1) Analyze the conciseness criterion: Is the submission concise and to the point?\\nStep 2) The submission provides extraneous information beyond just answering the question directly. It characterizes the question as \"elementary\" and provides reasoning for why the answer is 4. This additional commentary makes the submission not fully concise.\\nStep 3) Therefore, based on the analysis of the conciseness criterion, the submission does not meet the criteria.\\n\\nN', 'value': 'N', 'score': 0}\n"
]
}
],
@ -340,7 +338,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 16,
"id": "22e57704-682f-44ff-96ba-e915c73269c0",
"metadata": {
"tags": []
@ -364,13 +362,13 @@
"prompt = PromptTemplate.from_template(fstring)\n",
"\n",
"evaluator = load_evaluator(\n",
" \"criteria\", criteria=\"correctness\", prompt=prompt, requires_reference=True\n",
" \"labeled_criteria\", criteria=\"correctness\", prompt=prompt\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 17,
"id": "5d6b0eca-7aea-4073-a65a-18c3a9cdb5af",
"metadata": {
"tags": []
@ -380,7 +378,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'reasoning': 'Correctness: No, the submission is not correct. The expected response was \"It\\'s 17 now.\" but the response given was \"What\\'s 2+2? That\\'s an elementary question. The answer you\\'re looking for is that two and two is four.\"', 'value': 'N', 'score': 0}\n"
"{'reasoning': 'Correctness: No, the response is not correct. The expected response was \"It\\'s 17 now.\" but the response given was \"What\\'s 2+2? That\\'s an elementary question. The answer you\\'re looking for is that two and two is four.\"', 'value': 'N', 'score': 0}\n"
]
}
],

@ -53,7 +53,7 @@
{
"data": {
"text/plain": [
"{'score': 12}"
"{'score': 0.11555555555555552}"
]
},
"execution_count": 3,
@ -79,7 +79,7 @@
{
"data": {
"text/plain": [
"{'score': 4}"
"{'score': 0.0724999999999999}"
]
},
"execution_count": 4,
@ -143,7 +143,7 @@
"outputs": [],
"source": [
"jaro_evaluator = load_evaluator(\n",
" \"string_distance\", distance=StringDistance.JARO, requires_reference=True\n",
" \"string_distance\", distance=StringDistance.JARO\n",
")"
]
},

Loading…
Cancel
Save