mirror of https://github.com/hwchase17/langchain
Add Exact match and Regex Match Evaluators (#11132)
parent
e355606b11
commit
33da8bd711
@ -1,280 +1,281 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "657d2c8c-54b4-42a3-9f02-bdefa0ed6728",
|
"id": "657d2c8c-54b4-42a3-9f02-bdefa0ed6728",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Custom Pairwise Evaluator\n",
|
"# Custom Pairwise Evaluator\n",
|
||||||
"\n",
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/comparison/custom.ipynb)\n",
|
||||||
"You can make your own pairwise string evaluators by inheriting from `PairwiseStringEvaluator` class and overwriting the `_evaluate_string_pairs` method (and the `_aevaluate_string_pairs` method if you want to use the evaluator asynchronously).\n",
|
"\n",
|
||||||
"\n",
|
"You can make your own pairwise string evaluators by inheriting from `PairwiseStringEvaluator` class and overwriting the `_evaluate_string_pairs` method (and the `_aevaluate_string_pairs` method if you want to use the evaluator asynchronously).\n",
|
||||||
"In this example, you will make a simple custom evaluator that just returns whether the first prediction has more whitespace tokenized 'words' than the second.\n",
|
"\n",
|
||||||
"\n",
|
"In this example, you will make a simple custom evaluator that just returns whether the first prediction has more whitespace tokenized 'words' than the second.\n",
|
||||||
"You can check out the reference docs for the [PairwiseStringEvaluator interface](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.PairwiseStringEvaluator.html#langchain.evaluation.schema.PairwiseStringEvaluator) for more info.\n"
|
"\n",
|
||||||
]
|
"You can check out the reference docs for the [PairwiseStringEvaluator interface](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.PairwiseStringEvaluator.html#langchain.evaluation.schema.PairwiseStringEvaluator) for more info.\n"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 1,
|
"cell_type": "code",
|
||||||
"id": "93f3a653-d198-4291-973c-8d1adba338b2",
|
"execution_count": 1,
|
||||||
"metadata": {
|
"id": "93f3a653-d198-4291-973c-8d1adba338b2",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"from typing import Optional, Any\n",
|
"source": [
|
||||||
"from langchain.evaluation import PairwiseStringEvaluator\n",
|
"from typing import Optional, Any\n",
|
||||||
"\n",
|
"from langchain.evaluation import PairwiseStringEvaluator\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class LengthComparisonPairwiseEvalutor(PairwiseStringEvaluator):\n",
|
"\n",
|
||||||
" \"\"\"\n",
|
"class LengthComparisonPairwiseEvalutor(PairwiseStringEvaluator):\n",
|
||||||
" Custom evaluator to compare two strings.\n",
|
" \"\"\"\n",
|
||||||
" \"\"\"\n",
|
" Custom evaluator to compare two strings.\n",
|
||||||
"\n",
|
" \"\"\"\n",
|
||||||
" def _evaluate_string_pairs(\n",
|
"\n",
|
||||||
" self,\n",
|
" def _evaluate_string_pairs(\n",
|
||||||
" *,\n",
|
" self,\n",
|
||||||
" prediction: str,\n",
|
" *,\n",
|
||||||
" prediction_b: str,\n",
|
" prediction: str,\n",
|
||||||
" reference: Optional[str] = None,\n",
|
" prediction_b: str,\n",
|
||||||
" input: Optional[str] = None,\n",
|
" reference: Optional[str] = None,\n",
|
||||||
" **kwargs: Any,\n",
|
" input: Optional[str] = None,\n",
|
||||||
" ) -> dict:\n",
|
" **kwargs: Any,\n",
|
||||||
" score = int(len(prediction.split()) > len(prediction_b.split()))\n",
|
" ) -> dict:\n",
|
||||||
" return {\"score\": score}"
|
" score = int(len(prediction.split()) > len(prediction_b.split()))\n",
|
||||||
]
|
" return {\"score\": score}"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 2,
|
"cell_type": "code",
|
||||||
"id": "7d4a77c3-07a7-4076-8e7f-f9bca0d6c290",
|
"execution_count": 2,
|
||||||
"metadata": {
|
"id": "7d4a77c3-07a7-4076-8e7f-f9bca0d6c290",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 1}"
|
"text/plain": [
|
||||||
]
|
"{'score': 1}"
|
||||||
},
|
]
|
||||||
"execution_count": 2,
|
},
|
||||||
"metadata": {},
|
"execution_count": 2,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"evaluator = LengthComparisonPairwiseEvalutor()\n",
|
"source": [
|
||||||
"\n",
|
"evaluator = LengthComparisonPairwiseEvalutor()\n",
|
||||||
"evaluator.evaluate_string_pairs(\n",
|
"\n",
|
||||||
" prediction=\"The quick brown fox jumped over the lazy dog.\",\n",
|
"evaluator.evaluate_string_pairs(\n",
|
||||||
" prediction_b=\"The quick brown fox jumped over the dog.\",\n",
|
" prediction=\"The quick brown fox jumped over the lazy dog.\",\n",
|
||||||
")"
|
" prediction_b=\"The quick brown fox jumped over the dog.\",\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "markdown",
|
{
|
||||||
"id": "d90f128f-6f49-42a1-b05a-3aea568ee03b",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"id": "d90f128f-6f49-42a1-b05a-3aea568ee03b",
|
||||||
"source": [
|
"metadata": {},
|
||||||
"## LLM-Based Example\n",
|
"source": [
|
||||||
"\n",
|
"## LLM-Based Example\n",
|
||||||
"That example was simple to illustrate the API, but it wasn't very useful in practice. Below, use an LLM with some custom instructions to form a simple preference scorer similar to the built-in [PairwiseStringEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain.html#langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain). We will use `ChatAnthropic` for the evaluator chain."
|
"\n",
|
||||||
]
|
"That example was simple to illustrate the API, but it wasn't very useful in practice. Below, use an LLM with some custom instructions to form a simple preference scorer similar to the built-in [PairwiseStringEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain.html#langchain.evaluation.comparison.eval_chain.PairwiseStringEvalChain). We will use `ChatAnthropic` for the evaluator chain."
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 3,
|
"cell_type": "code",
|
||||||
"id": "b4b43098-4d96-417b-a8a9-b3e75779cfe8",
|
"execution_count": 3,
|
||||||
"metadata": {
|
"id": "b4b43098-4d96-417b-a8a9-b3e75779cfe8",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"# %pip install anthropic\n",
|
"source": [
|
||||||
"# %env ANTHROPIC_API_KEY=YOUR_API_KEY"
|
"# %pip install anthropic\n",
|
||||||
]
|
"# %env ANTHROPIC_API_KEY=YOUR_API_KEY"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 4,
|
"cell_type": "code",
|
||||||
"id": "b6e978ab-48f1-47ff-9506-e13b1a50be6e",
|
"execution_count": 4,
|
||||||
"metadata": {
|
"id": "b6e978ab-48f1-47ff-9506-e13b1a50be6e",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"from typing import Optional, Any\n",
|
"source": [
|
||||||
"from langchain.evaluation import PairwiseStringEvaluator\n",
|
"from typing import Optional, Any\n",
|
||||||
"from langchain.chat_models import ChatAnthropic\n",
|
"from langchain.evaluation import PairwiseStringEvaluator\n",
|
||||||
"from langchain.chains import LLMChain\n",
|
"from langchain.chat_models import ChatAnthropic\n",
|
||||||
"\n",
|
"from langchain.chains import LLMChain\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class CustomPreferenceEvaluator(PairwiseStringEvaluator):\n",
|
"\n",
|
||||||
" \"\"\"\n",
|
"class CustomPreferenceEvaluator(PairwiseStringEvaluator):\n",
|
||||||
" Custom evaluator to compare two strings using a custom LLMChain.\n",
|
" \"\"\"\n",
|
||||||
" \"\"\"\n",
|
" Custom evaluator to compare two strings using a custom LLMChain.\n",
|
||||||
"\n",
|
" \"\"\"\n",
|
||||||
" def __init__(self) -> None:\n",
|
"\n",
|
||||||
" llm = ChatAnthropic(model=\"claude-2\", temperature=0)\n",
|
" def __init__(self) -> None:\n",
|
||||||
" self.eval_chain = LLMChain.from_string(\n",
|
" llm = ChatAnthropic(model=\"claude-2\", temperature=0)\n",
|
||||||
" llm,\n",
|
" self.eval_chain = LLMChain.from_string(\n",
|
||||||
" \"\"\"Which option is preferred? Do not take order into account. Evaluate based on accuracy and helpfulness. If neither is preferred, respond with C. Provide your reasoning, then finish with Preference: A/B/C\n",
|
" llm,\n",
|
||||||
"\n",
|
" \"\"\"Which option is preferred? Do not take order into account. Evaluate based on accuracy and helpfulness. If neither is preferred, respond with C. Provide your reasoning, then finish with Preference: A/B/C\n",
|
||||||
"Input: How do I get the path of the parent directory in python 3.8?\n",
|
"\n",
|
||||||
"Option A: You can use the following code:\n",
|
"Input: How do I get the path of the parent directory in python 3.8?\n",
|
||||||
"```python\n",
|
"Option A: You can use the following code:\n",
|
||||||
"import os\n",
|
"```python\n",
|
||||||
"\n",
|
"import os\n",
|
||||||
"os.path.dirname(os.path.dirname(os.path.abspath(__file__)))\n",
|
"\n",
|
||||||
"```\n",
|
"os.path.dirname(os.path.dirname(os.path.abspath(__file__)))\n",
|
||||||
"Option B: You can use the following code:\n",
|
"```\n",
|
||||||
"```python\n",
|
"Option B: You can use the following code:\n",
|
||||||
"from pathlib import Path\n",
|
"```python\n",
|
||||||
"Path(__file__).absolute().parent\n",
|
"from pathlib import Path\n",
|
||||||
"```\n",
|
"Path(__file__).absolute().parent\n",
|
||||||
"Reasoning: Both options return the same result. However, since option B is more concise and easily understand, it is preferred.\n",
|
"```\n",
|
||||||
"Preference: B\n",
|
"Reasoning: Both options return the same result. However, since option B is more concise and easily understand, it is preferred.\n",
|
||||||
"\n",
|
"Preference: B\n",
|
||||||
"Which option is preferred? Do not take order into account. Evaluate based on accuracy and helpfulness. If neither is preferred, respond with C. Provide your reasoning, then finish with Preference: A/B/C\n",
|
"\n",
|
||||||
"Input: {input}\n",
|
"Which option is preferred? Do not take order into account. Evaluate based on accuracy and helpfulness. If neither is preferred, respond with C. Provide your reasoning, then finish with Preference: A/B/C\n",
|
||||||
"Option A: {prediction}\n",
|
"Input: {input}\n",
|
||||||
"Option B: {prediction_b}\n",
|
"Option A: {prediction}\n",
|
||||||
"Reasoning:\"\"\",\n",
|
"Option B: {prediction_b}\n",
|
||||||
" )\n",
|
"Reasoning:\"\"\",\n",
|
||||||
"\n",
|
" )\n",
|
||||||
" @property\n",
|
"\n",
|
||||||
" def requires_input(self) -> bool:\n",
|
" @property\n",
|
||||||
" return True\n",
|
" def requires_input(self) -> bool:\n",
|
||||||
"\n",
|
" return True\n",
|
||||||
" @property\n",
|
"\n",
|
||||||
" def requires_reference(self) -> bool:\n",
|
" @property\n",
|
||||||
" return False\n",
|
" def requires_reference(self) -> bool:\n",
|
||||||
"\n",
|
" return False\n",
|
||||||
" def _evaluate_string_pairs(\n",
|
"\n",
|
||||||
" self,\n",
|
" def _evaluate_string_pairs(\n",
|
||||||
" *,\n",
|
" self,\n",
|
||||||
" prediction: str,\n",
|
" *,\n",
|
||||||
" prediction_b: str,\n",
|
" prediction: str,\n",
|
||||||
" reference: Optional[str] = None,\n",
|
" prediction_b: str,\n",
|
||||||
" input: Optional[str] = None,\n",
|
" reference: Optional[str] = None,\n",
|
||||||
" **kwargs: Any,\n",
|
" input: Optional[str] = None,\n",
|
||||||
" ) -> dict:\n",
|
" **kwargs: Any,\n",
|
||||||
" result = self.eval_chain(\n",
|
" ) -> dict:\n",
|
||||||
" {\n",
|
" result = self.eval_chain(\n",
|
||||||
" \"input\": input,\n",
|
" {\n",
|
||||||
" \"prediction\": prediction,\n",
|
" \"input\": input,\n",
|
||||||
" \"prediction_b\": prediction_b,\n",
|
" \"prediction\": prediction,\n",
|
||||||
" \"stop\": [\"Which option is preferred?\"],\n",
|
" \"prediction_b\": prediction_b,\n",
|
||||||
" },\n",
|
" \"stop\": [\"Which option is preferred?\"],\n",
|
||||||
" **kwargs,\n",
|
" },\n",
|
||||||
" )\n",
|
" **kwargs,\n",
|
||||||
"\n",
|
" )\n",
|
||||||
" response_text = result[\"text\"]\n",
|
"\n",
|
||||||
" reasoning, preference = response_text.split(\"Preference:\", maxsplit=1)\n",
|
" response_text = result[\"text\"]\n",
|
||||||
" preference = preference.strip()\n",
|
" reasoning, preference = response_text.split(\"Preference:\", maxsplit=1)\n",
|
||||||
" score = 1.0 if preference == \"A\" else (0.0 if preference == \"B\" else None)\n",
|
" preference = preference.strip()\n",
|
||||||
" return {\"reasoning\": reasoning.strip(), \"value\": preference, \"score\": score}"
|
" score = 1.0 if preference == \"A\" else (0.0 if preference == \"B\" else None)\n",
|
||||||
]
|
" return {\"reasoning\": reasoning.strip(), \"value\": preference, \"score\": score}"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 6,
|
"cell_type": "code",
|
||||||
"id": "5cbd8b1d-2cb0-4f05-b435-a1a00074d94a",
|
"execution_count": 6,
|
||||||
"metadata": {
|
"id": "5cbd8b1d-2cb0-4f05-b435-a1a00074d94a",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"evaluator = CustomPreferenceEvaluator()"
|
"source": [
|
||||||
]
|
"evaluator = CustomPreferenceEvaluator()"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 7,
|
"cell_type": "code",
|
||||||
"id": "2c0a7fb7-b976-4443-9f0e-e707a6dfbdf7",
|
"execution_count": 7,
|
||||||
"metadata": {
|
"id": "2c0a7fb7-b976-4443-9f0e-e707a6dfbdf7",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'reasoning': 'Option B is preferred over option A for importing from a relative directory, because it is more straightforward and concise.\\n\\nOption A uses the importlib module, which allows importing a module by specifying the full name as a string. While this works, it is less clear compared to option B.\\n\\nOption B directly imports from the relative path using dot notation, which clearly shows that it is a relative import. This is the recommended way to do relative imports in Python.\\n\\nIn summary, option B is more accurate and helpful as it uses the standard Python relative import syntax.',\n",
|
"text/plain": [
|
||||||
" 'value': 'B',\n",
|
"{'reasoning': 'Option B is preferred over option A for importing from a relative directory, because it is more straightforward and concise.\\n\\nOption A uses the importlib module, which allows importing a module by specifying the full name as a string. While this works, it is less clear compared to option B.\\n\\nOption B directly imports from the relative path using dot notation, which clearly shows that it is a relative import. This is the recommended way to do relative imports in Python.\\n\\nIn summary, option B is more accurate and helpful as it uses the standard Python relative import syntax.',\n",
|
||||||
" 'score': 0.0}"
|
" 'value': 'B',\n",
|
||||||
]
|
" 'score': 0.0}"
|
||||||
},
|
]
|
||||||
"execution_count": 7,
|
},
|
||||||
"metadata": {},
|
"execution_count": 7,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"evaluator.evaluate_string_pairs(\n",
|
"source": [
|
||||||
" input=\"How do I import from a relative directory?\",\n",
|
"evaluator.evaluate_string_pairs(\n",
|
||||||
" prediction=\"use importlib! importlib.import_module('.my_package', '.')\",\n",
|
" input=\"How do I import from a relative directory?\",\n",
|
||||||
" prediction_b=\"from .sibling import foo\",\n",
|
" prediction=\"use importlib! importlib.import_module('.my_package', '.')\",\n",
|
||||||
")"
|
" prediction_b=\"from .sibling import foo\",\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 13,
|
"cell_type": "code",
|
||||||
"id": "f13a1346-7dbe-451d-b3a3-99e8fc7b753b",
|
"execution_count": 13,
|
||||||
"metadata": {
|
"id": "f13a1346-7dbe-451d-b3a3-99e8fc7b753b",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"name": "stdout",
|
{
|
||||||
"output_type": "stream",
|
"name": "stdout",
|
||||||
"text": [
|
"output_type": "stream",
|
||||||
"CustomPreferenceEvaluator requires an input string.\n"
|
"text": [
|
||||||
]
|
"CustomPreferenceEvaluator requires an input string.\n"
|
||||||
}
|
]
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"# Setting requires_input to return True adds additional validation to avoid returning a grade when insufficient data is provided to the chain.\n",
|
"source": [
|
||||||
"\n",
|
"# Setting requires_input to return True adds additional validation to avoid returning a grade when insufficient data is provided to the chain.\n",
|
||||||
"try:\n",
|
"\n",
|
||||||
" evaluator.evaluate_string_pairs(\n",
|
"try:\n",
|
||||||
" prediction=\"use importlib! importlib.import_module('.my_package', '.')\",\n",
|
" evaluator.evaluate_string_pairs(\n",
|
||||||
" prediction_b=\"from .sibling import foo\",\n",
|
" prediction=\"use importlib! importlib.import_module('.my_package', '.')\",\n",
|
||||||
" )\n",
|
" prediction_b=\"from .sibling import foo\",\n",
|
||||||
"except ValueError as e:\n",
|
" )\n",
|
||||||
" print(e)"
|
"except ValueError as e:\n",
|
||||||
]
|
" print(e)"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": null,
|
"cell_type": "code",
|
||||||
"id": "e7829cc3-ebd1-4628-ae97-15166202e9cc",
|
"execution_count": null,
|
||||||
"metadata": {},
|
"id": "e7829cc3-ebd1-4628-ae97-15166202e9cc",
|
||||||
"outputs": [],
|
"metadata": {},
|
||||||
"source": []
|
"outputs": [],
|
||||||
}
|
"source": []
|
||||||
],
|
}
|
||||||
"metadata": {
|
],
|
||||||
"kernelspec": {
|
"metadata": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"kernelspec": {
|
||||||
"language": "python",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"name": "python3"
|
"language": "python",
|
||||||
},
|
"name": "python3"
|
||||||
"language_info": {
|
},
|
||||||
"codemirror_mode": {
|
"language_info": {
|
||||||
"name": "ipython",
|
"codemirror_mode": {
|
||||||
"version": 3
|
"name": "ipython",
|
||||||
},
|
"version": 3
|
||||||
"file_extension": ".py",
|
},
|
||||||
"mimetype": "text/x-python",
|
"file_extension": ".py",
|
||||||
"name": "python",
|
"mimetype": "text/x-python",
|
||||||
"nbconvert_exporter": "python",
|
"name": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"nbconvert_exporter": "python",
|
||||||
"version": "3.11.2"
|
"pygments_lexer": "ipython3",
|
||||||
}
|
"version": "3.11.2"
|
||||||
},
|
}
|
||||||
"nbformat": 4,
|
},
|
||||||
"nbformat_minor": 5
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
}
|
}
|
||||||
|
@ -1,232 +1,233 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"attachments": {},
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"tags": []
|
"tags": []
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# Pairwise Embedding Distance \n",
|
"# Pairwise Embedding Distance \n",
|
||||||
"\n",
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/comparison/pairwise_embedding_distance.ipynb)\n",
|
||||||
"One way to measure the similarity (or dissimilarity) between two predictions on a shared or similar input is to embed the predictions and compute a vector distance between the two embeddings.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
|
"\n",
|
||||||
"\n",
|
"One way to measure the similarity (or dissimilarity) between two predictions on a shared or similar input is to embed the predictions and compute a vector distance between the two embeddings.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
|
||||||
"You can load the `pairwise_embedding_distance` evaluator to do this.\n",
|
"\n",
|
||||||
"\n",
|
"You can load the `pairwise_embedding_distance` evaluator to do this.\n",
|
||||||
"**Note:** This returns a **distance** score, meaning that the lower the number, the **more** similar the outputs are, according to their embedded representation.\n",
|
"\n",
|
||||||
"\n",
|
"**Note:** This returns a **distance** score, meaning that the lower the number, the **more** similar the outputs are, according to their embedded representation.\n",
|
||||||
"Check out the reference docs for the [PairwiseEmbeddingDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain.html#langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain) for more info."
|
"\n",
|
||||||
]
|
"Check out the reference docs for the [PairwiseEmbeddingDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain.html#langchain.evaluation.embedding_distance.base.PairwiseEmbeddingDistanceEvalChain) for more info."
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 1,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 1,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"from langchain.evaluation import load_evaluator\n",
|
"source": [
|
||||||
"\n",
|
"from langchain.evaluation import load_evaluator\n",
|
||||||
"evaluator = load_evaluator(\"pairwise_embedding_distance\")"
|
"\n",
|
||||||
]
|
"evaluator = load_evaluator(\"pairwise_embedding_distance\")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 2,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 2,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.0966466944859925}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.0966466944859925}"
|
||||||
},
|
]
|
||||||
"execution_count": 2,
|
},
|
||||||
"metadata": {},
|
"execution_count": 2,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"evaluator.evaluate_string_pairs(\n",
|
"source": [
|
||||||
" prediction=\"Seattle is hot in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
"evaluator.evaluate_string_pairs(\n",
|
||||||
")"
|
" prediction=\"Seattle is hot in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 3,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 3,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.03761174337464557}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.03761174337464557}"
|
||||||
},
|
]
|
||||||
"execution_count": 3,
|
},
|
||||||
"metadata": {},
|
"execution_count": 3,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"evaluator.evaluate_string_pairs(\n",
|
"source": [
|
||||||
" prediction=\"Seattle is warm in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
"evaluator.evaluate_string_pairs(\n",
|
||||||
")"
|
" prediction=\"Seattle is warm in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "markdown",
|
{
|
||||||
"metadata": {},
|
"cell_type": "markdown",
|
||||||
"source": [
|
"metadata": {},
|
||||||
"## Select the Distance Metric\n",
|
"source": [
|
||||||
"\n",
|
"## Select the Distance Metric\n",
|
||||||
"By default, the evalutor uses cosine distance. You can choose a different distance metric if you'd like. "
|
"\n",
|
||||||
]
|
"By default, the evalutor uses cosine distance. You can choose a different distance metric if you'd like. "
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 4,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 4,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"[<EmbeddingDistance.COSINE: 'cosine'>,\n",
|
"text/plain": [
|
||||||
" <EmbeddingDistance.EUCLIDEAN: 'euclidean'>,\n",
|
"[<EmbeddingDistance.COSINE: 'cosine'>,\n",
|
||||||
" <EmbeddingDistance.MANHATTAN: 'manhattan'>,\n",
|
" <EmbeddingDistance.EUCLIDEAN: 'euclidean'>,\n",
|
||||||
" <EmbeddingDistance.CHEBYSHEV: 'chebyshev'>,\n",
|
" <EmbeddingDistance.MANHATTAN: 'manhattan'>,\n",
|
||||||
" <EmbeddingDistance.HAMMING: 'hamming'>]"
|
" <EmbeddingDistance.CHEBYSHEV: 'chebyshev'>,\n",
|
||||||
]
|
" <EmbeddingDistance.HAMMING: 'hamming'>]"
|
||||||
},
|
]
|
||||||
"execution_count": 4,
|
},
|
||||||
"metadata": {},
|
"execution_count": 4,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"from langchain.evaluation import EmbeddingDistance\n",
|
"source": [
|
||||||
"\n",
|
"from langchain.evaluation import EmbeddingDistance\n",
|
||||||
"list(EmbeddingDistance)"
|
"\n",
|
||||||
]
|
"list(EmbeddingDistance)"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 5,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 5,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"evaluator = load_evaluator(\n",
|
"source": [
|
||||||
" \"pairwise_embedding_distance\", distance_metric=EmbeddingDistance.EUCLIDEAN\n",
|
"evaluator = load_evaluator(\n",
|
||||||
")"
|
" \"pairwise_embedding_distance\", distance_metric=EmbeddingDistance.EUCLIDEAN\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "markdown",
|
{
|
||||||
"metadata": {},
|
"cell_type": "markdown",
|
||||||
"source": [
|
"metadata": {},
|
||||||
"## Select Embeddings to Use\n",
|
"source": [
|
||||||
"\n",
|
"## Select Embeddings to Use\n",
|
||||||
"The constructor uses `OpenAI` embeddings by default, but you can configure this however you want. Below, use huggingface local embeddings"
|
"\n",
|
||||||
]
|
"The constructor uses `OpenAI` embeddings by default, but you can configure this however you want. Below, use huggingface local embeddings"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": null,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": null,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
"source": [
|
||||||
"\n",
|
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
||||||
"embedding_model = HuggingFaceEmbeddings()\n",
|
"\n",
|
||||||
"hf_evaluator = load_evaluator(\"pairwise_embedding_distance\", embeddings=embedding_model)"
|
"embedding_model = HuggingFaceEmbeddings()\n",
|
||||||
]
|
"hf_evaluator = load_evaluator(\"pairwise_embedding_distance\", embeddings=embedding_model)"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 10,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 10,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.5486443280477362}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.5486443280477362}"
|
||||||
},
|
]
|
||||||
"execution_count": 10,
|
},
|
||||||
"metadata": {},
|
"execution_count": 10,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"hf_evaluator.evaluate_string_pairs(\n",
|
"source": [
|
||||||
" prediction=\"Seattle is hot in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
"hf_evaluator.evaluate_string_pairs(\n",
|
||||||
")"
|
" prediction=\"Seattle is hot in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 12,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 12,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.21018880025138598}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.21018880025138598}"
|
||||||
},
|
]
|
||||||
"execution_count": 12,
|
},
|
||||||
"metadata": {},
|
"execution_count": 12,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"hf_evaluator.evaluate_string_pairs(\n",
|
"source": [
|
||||||
" prediction=\"Seattle is warm in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
"hf_evaluator.evaluate_string_pairs(\n",
|
||||||
")"
|
" prediction=\"Seattle is warm in June\", prediction_b=\"Seattle is cool in June.\"\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "markdown",
|
{
|
||||||
"metadata": {},
|
"cell_type": "markdown",
|
||||||
"source": [
|
"metadata": {},
|
||||||
"<a name=\"cite_note-1\"></a><i>1. Note: When it comes to semantic similarity, this often gives better results than older string distance metrics (such as those in the `PairwiseStringDistanceEvalChain`), though it tends to be less reliable than evaluators that use the LLM directly (such as the `PairwiseStringEvalChain`) </i>"
|
"source": [
|
||||||
]
|
"<a name=\"cite_note-1\"></a><i>1. Note: When it comes to semantic similarity, this often gives better results than older string distance metrics (such as those in the `PairwiseStringDistanceEvalChain`), though it tends to be less reliable than evaluators that use the LLM directly (such as the `PairwiseStringEvalChain`) </i>"
|
||||||
}
|
]
|
||||||
],
|
}
|
||||||
"metadata": {
|
],
|
||||||
"kernelspec": {
|
"metadata": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"kernelspec": {
|
||||||
"language": "python",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"name": "python3"
|
"language": "python",
|
||||||
},
|
"name": "python3"
|
||||||
"language_info": {
|
},
|
||||||
"codemirror_mode": {
|
"language_info": {
|
||||||
"name": "ipython",
|
"codemirror_mode": {
|
||||||
"version": 3
|
"name": "ipython",
|
||||||
},
|
"version": 3
|
||||||
"file_extension": ".py",
|
},
|
||||||
"mimetype": "text/x-python",
|
"file_extension": ".py",
|
||||||
"name": "python",
|
"mimetype": "text/x-python",
|
||||||
"nbconvert_exporter": "python",
|
"name": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"nbconvert_exporter": "python",
|
||||||
"version": "3.11.2"
|
"pygments_lexer": "ipython3",
|
||||||
}
|
"version": "3.11.2"
|
||||||
},
|
}
|
||||||
"nbformat": 4,
|
},
|
||||||
"nbformat_minor": 4
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
}
|
}
|
@ -1,447 +1,448 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Comparing Chain Outputs\n",
|
"# Comparing Chain Outputs\n",
|
||||||
"\n",
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/examples/comparisons.ipynb)\n",
|
||||||
"Suppose you have two different prompts (or LLMs). How do you know which will generate \"better\" results?\n",
|
"\n",
|
||||||
"\n",
|
"Suppose you have two different prompts (or LLMs). How do you know which will generate \"better\" results?\n",
|
||||||
"One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
|
"\n",
|
||||||
"\n",
|
"One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
|
||||||
"For this evaluation, we will need 3 things:\n",
|
"\n",
|
||||||
"1. An evaluator\n",
|
"For this evaluation, we will need 3 things:\n",
|
||||||
"2. A dataset of inputs\n",
|
"1. An evaluator\n",
|
||||||
"3. 2 (or more) LLMs, Chains, or Agents to compare\n",
|
"2. A dataset of inputs\n",
|
||||||
"\n",
|
"3. 2 (or more) LLMs, Chains, or Agents to compare\n",
|
||||||
"Then we will aggregate the restults to determine the preferred model.\n",
|
"\n",
|
||||||
"\n",
|
"Then we will aggregate the restults to determine the preferred model.\n",
|
||||||
"### Step 1. Create the Evaluator\n",
|
"\n",
|
||||||
"\n",
|
"### Step 1. Create the Evaluator\n",
|
||||||
"In this example, you will use gpt-4 to select which output is preferred."
|
"\n",
|
||||||
]
|
"In this example, you will use gpt-4 to select which output is preferred."
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 1,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 1,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"from langchain.evaluation import load_evaluator\n",
|
"source": [
|
||||||
"\n",
|
"from langchain.evaluation import load_evaluator\n",
|
||||||
"eval_chain = load_evaluator(\"pairwise_string\")"
|
"\n",
|
||||||
]
|
"eval_chain = load_evaluator(\"pairwise_string\")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "markdown",
|
{
|
||||||
"metadata": {},
|
"cell_type": "markdown",
|
||||||
"source": [
|
"metadata": {},
|
||||||
"### Step 2. Select Dataset\n",
|
"source": [
|
||||||
"\n",
|
"### Step 2. Select Dataset\n",
|
||||||
"If you already have real usage data for your LLM, you can use a representative sample. More examples\n",
|
"\n",
|
||||||
"provide more reliable results. We will use some example queries someone might have about how to use langchain here."
|
"If you already have real usage data for your LLM, you can use a representative sample. More examples\n",
|
||||||
]
|
"provide more reliable results. We will use some example queries someone might have about how to use langchain here."
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 2,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 2,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"name": "stderr",
|
{
|
||||||
"output_type": "stream",
|
"name": "stderr",
|
||||||
"text": [
|
"output_type": "stream",
|
||||||
"Found cached dataset parquet (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___parquet/LangChainDatasets--langchain-howto-queries-bbb748bbee7e77aa/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\n"
|
"text": [
|
||||||
]
|
"Found cached dataset parquet (/Users/wfh/.cache/huggingface/datasets/LangChainDatasets___parquet/LangChainDatasets--langchain-howto-queries-bbb748bbee7e77aa/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "a2358d37246640ce95e0f9940194590a",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain.evaluation.loading import load_dataset\n",
|
||||||
|
"\n",
|
||||||
|
"dataset = load_dataset(\"langchain-howto-queries\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Step 3. Define Models to Compare\n",
|
||||||
|
"\n",
|
||||||
|
"We will be comparing two agents in this case."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.utilities import SerpAPIWrapper\n",
|
||||||
|
"from langchain.agents import initialize_agent, Tool\n",
|
||||||
|
"from langchain.agents import AgentType\n",
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Initialize the language model\n",
|
||||||
|
"# You can add your own OpenAI API key by adding openai_api_key=\"<your_api_key>\"\n",
|
||||||
|
"llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Initialize the SerpAPIWrapper for search functionality\n",
|
||||||
|
"# Replace <your_api_key> in openai_api_key=\"<your_api_key>\" with your actual SerpAPI key.\n",
|
||||||
|
"search = SerpAPIWrapper()\n",
|
||||||
|
"\n",
|
||||||
|
"# Define a list of tools offered by the agent\n",
|
||||||
|
"tools = [\n",
|
||||||
|
" Tool(\n",
|
||||||
|
" name=\"Search\",\n",
|
||||||
|
" func=search.run,\n",
|
||||||
|
" coroutine=search.arun,\n",
|
||||||
|
" description=\"Useful when you need to answer questions about current events. You should ask targeted questions.\",\n",
|
||||||
|
" ),\n",
|
||||||
|
"]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"functions_agent = initialize_agent(\n",
|
||||||
|
" tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False\n",
|
||||||
|
")\n",
|
||||||
|
"conversations_agent = initialize_agent(\n",
|
||||||
|
" tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Step 4. Generate Responses\n",
|
||||||
|
"\n",
|
||||||
|
"We will generate outputs for each of the models before evaluating them."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "87277cb39a1a4726bb7cc533a24e2ea4",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
" 0%| | 0/20 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from tqdm.notebook import tqdm\n",
|
||||||
|
"import asyncio\n",
|
||||||
|
"\n",
|
||||||
|
"results = []\n",
|
||||||
|
"agents = [functions_agent, conversations_agent]\n",
|
||||||
|
"concurrency_level = 6 # How many concurrent agents to run. May need to decrease if OpenAI is rate limiting.\n",
|
||||||
|
"\n",
|
||||||
|
"# We will only run the first 20 examples of this dataset to speed things up\n",
|
||||||
|
"# This will lead to larger confidence intervals downstream.\n",
|
||||||
|
"batch = []\n",
|
||||||
|
"for example in tqdm(dataset[:20]):\n",
|
||||||
|
" batch.extend([agent.acall(example[\"inputs\"]) for agent in agents])\n",
|
||||||
|
" if len(batch) >= concurrency_level:\n",
|
||||||
|
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
|
||||||
|
" results.extend(list(zip(*[iter(batch_results)] * 2)))\n",
|
||||||
|
" batch = []\n",
|
||||||
|
"if batch:\n",
|
||||||
|
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
|
||||||
|
" results.extend(list(zip(*[iter(batch_results)] * 2)))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Step 5. Evaluate Pairs\n",
|
||||||
|
"\n",
|
||||||
|
"Now it's time to evaluate the results. For each agent response, run the evaluation chain to select which output is preferred (or return a tie).\n",
|
||||||
|
"\n",
|
||||||
|
"Randomly select the input order to reduce the likelihood that one model will be preferred just because it is presented first."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import random\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def predict_preferences(dataset, results) -> list:\n",
|
||||||
|
" preferences = []\n",
|
||||||
|
"\n",
|
||||||
|
" for example, (res_a, res_b) in zip(dataset, results):\n",
|
||||||
|
" input_ = example[\"inputs\"]\n",
|
||||||
|
" # Flip a coin to reduce persistent position bias\n",
|
||||||
|
" if random.random() < 0.5:\n",
|
||||||
|
" pred_a, pred_b = res_a, res_b\n",
|
||||||
|
" a, b = \"a\", \"b\"\n",
|
||||||
|
" else:\n",
|
||||||
|
" pred_a, pred_b = res_b, res_a\n",
|
||||||
|
" a, b = \"b\", \"a\"\n",
|
||||||
|
" eval_res = eval_chain.evaluate_string_pairs(\n",
|
||||||
|
" prediction=pred_a[\"output\"] if isinstance(pred_a, dict) else str(pred_a),\n",
|
||||||
|
" prediction_b=pred_b[\"output\"] if isinstance(pred_b, dict) else str(pred_b),\n",
|
||||||
|
" input=input_,\n",
|
||||||
|
" )\n",
|
||||||
|
" if eval_res[\"value\"] == \"A\":\n",
|
||||||
|
" preferences.append(a)\n",
|
||||||
|
" elif eval_res[\"value\"] == \"B\":\n",
|
||||||
|
" preferences.append(b)\n",
|
||||||
|
" else:\n",
|
||||||
|
" preferences.append(None) # No preference\n",
|
||||||
|
" return preferences"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"preferences = predict_preferences(dataset, results)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"**Print out the ratio of preferences.**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"OpenAI Functions Agent: 95.00%\n",
|
||||||
|
"None: 5.00%\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from collections import Counter\n",
|
||||||
|
"\n",
|
||||||
|
"name_map = {\n",
|
||||||
|
" \"a\": \"OpenAI Functions Agent\",\n",
|
||||||
|
" \"b\": \"Structured Chat Agent\",\n",
|
||||||
|
"}\n",
|
||||||
|
"counts = Counter(preferences)\n",
|
||||||
|
"pref_ratios = {k: v / len(preferences) for k, v in counts.items()}\n",
|
||||||
|
"for k, v in pref_ratios.items():\n",
|
||||||
|
" print(f\"{name_map.get(k)}: {v:.2%}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Estimate Confidence Intervals\n",
|
||||||
|
"\n",
|
||||||
|
"The results seem pretty clear, but if you want to have a better sense of how confident we are, that model \"A\" (the OpenAI Functions Agent) is the preferred model, we can calculate confidence intervals. \n",
|
||||||
|
"\n",
|
||||||
|
"Below, use the Wilson score to estimate the confidence interval."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from math import sqrt\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def wilson_score_interval(\n",
|
||||||
|
" preferences: list, which: str = \"a\", z: float = 1.96\n",
|
||||||
|
") -> tuple:\n",
|
||||||
|
" \"\"\"Estimate the confidence interval using the Wilson score.\n",
|
||||||
|
"\n",
|
||||||
|
" See: https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval\n",
|
||||||
|
" for more details, including when to use it and when it should not be used.\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" total_preferences = preferences.count(\"a\") + preferences.count(\"b\")\n",
|
||||||
|
" n_s = preferences.count(which)\n",
|
||||||
|
"\n",
|
||||||
|
" if total_preferences == 0:\n",
|
||||||
|
" return (0, 0)\n",
|
||||||
|
"\n",
|
||||||
|
" p_hat = n_s / total_preferences\n",
|
||||||
|
"\n",
|
||||||
|
" denominator = 1 + (z**2) / total_preferences\n",
|
||||||
|
" adjustment = (z / denominator) * sqrt(\n",
|
||||||
|
" p_hat * (1 - p_hat) / total_preferences\n",
|
||||||
|
" + (z**2) / (4 * total_preferences * total_preferences)\n",
|
||||||
|
" )\n",
|
||||||
|
" center = (p_hat + (z**2) / (2 * total_preferences)) / denominator\n",
|
||||||
|
" lower_bound = min(max(center - adjustment, 0.0), 1.0)\n",
|
||||||
|
" upper_bound = min(max(center + adjustment, 0.0), 1.0)\n",
|
||||||
|
"\n",
|
||||||
|
" return (lower_bound, upper_bound)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The \"OpenAI Functions Agent\" would be preferred between 83.18% and 100.00% percent of the time (with 95% confidence).\n",
|
||||||
|
"The \"Structured Chat Agent\" would be preferred between 0.00% and 16.82% percent of the time (with 95% confidence).\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for which_, name in name_map.items():\n",
|
||||||
|
" low, high = wilson_score_interval(preferences, which=which_)\n",
|
||||||
|
" print(\n",
|
||||||
|
" f'The \"{name}\" would be preferred between {low:.2%} and {high:.2%} percent of the time (with 95% confidence).'\n",
|
||||||
|
" )"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**Print out the p-value.**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The p-value is 0.00000. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
|
||||||
|
"then there is a 0.00038% chance of observing the OpenAI Functions Agent be preferred at least 19\n",
|
||||||
|
"times out of 19 trials.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/var/folders/gf/6rnp_mbx5914kx7qmmh7xzmw0000gn/T/ipykernel_15978/384907688.py:6: DeprecationWarning: 'binom_test' is deprecated in favour of 'binomtest' from version 1.7.0 and will be removed in Scipy 1.12.0.\n",
|
||||||
|
" p_value = stats.binom_test(successes, n, p=0.5, alternative=\"two-sided\")\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from scipy import stats\n",
|
||||||
|
"\n",
|
||||||
|
"preferred_model = max(pref_ratios, key=pref_ratios.get)\n",
|
||||||
|
"successes = preferences.count(preferred_model)\n",
|
||||||
|
"n = len(preferences) - preferences.count(None)\n",
|
||||||
|
"p_value = stats.binom_test(successes, n, p=0.5, alternative=\"two-sided\")\n",
|
||||||
|
"print(\n",
|
||||||
|
" f\"\"\"The p-value is {p_value:.5f}. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
|
||||||
|
"then there is a {p_value:.5%} chance of observing the {name_map.get(preferred_model)} be preferred at least {successes}\n",
|
||||||
|
"times out of {n} trials.\"\"\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<a name=\"cite_note-1\"></a>_1. Note: Automated evals are still an open research topic and are best used alongside other evaluation approaches. \n",
|
||||||
|
"LLM preferences exhibit biases, including banal ones like the order of outputs.\n",
|
||||||
|
"In choosing preferences, \"ground truth\" may not be taken into account, which may lead to scores that aren't grounded in utility._"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
"nbformat": 4,
|
||||||
"data": {
|
"nbformat_minor": 4
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
|
||||||
"model_id": "a2358d37246640ce95e0f9940194590a",
|
|
||||||
"version_major": 2,
|
|
||||||
"version_minor": 0
|
|
||||||
},
|
|
||||||
"text/plain": [
|
|
||||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"from langchain.evaluation.loading import load_dataset\n",
|
|
||||||
"\n",
|
|
||||||
"dataset = load_dataset(\"langchain-howto-queries\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"### Step 3. Define Models to Compare\n",
|
|
||||||
"\n",
|
|
||||||
"We will be comparing two agents in this case."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain.utilities import SerpAPIWrapper\n",
|
|
||||||
"from langchain.agents import initialize_agent, Tool\n",
|
|
||||||
"from langchain.agents import AgentType\n",
|
|
||||||
"from langchain.chat_models import ChatOpenAI\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"# Initialize the language model\n",
|
|
||||||
"# You can add your own OpenAI API key by adding openai_api_key=\"<your_api_key>\"\n",
|
|
||||||
"llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Initialize the SerpAPIWrapper for search functionality\n",
|
|
||||||
"# Replace <your_api_key> in openai_api_key=\"<your_api_key>\" with your actual SerpAPI key.\n",
|
|
||||||
"search = SerpAPIWrapper()\n",
|
|
||||||
"\n",
|
|
||||||
"# Define a list of tools offered by the agent\n",
|
|
||||||
"tools = [\n",
|
|
||||||
" Tool(\n",
|
|
||||||
" name=\"Search\",\n",
|
|
||||||
" func=search.run,\n",
|
|
||||||
" coroutine=search.arun,\n",
|
|
||||||
" description=\"Useful when you need to answer questions about current events. You should ask targeted questions.\",\n",
|
|
||||||
" ),\n",
|
|
||||||
"]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"functions_agent = initialize_agent(\n",
|
|
||||||
" tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False\n",
|
|
||||||
")\n",
|
|
||||||
"conversations_agent = initialize_agent(\n",
|
|
||||||
" tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n",
|
|
||||||
")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"### Step 4. Generate Responses\n",
|
|
||||||
"\n",
|
|
||||||
"We will generate outputs for each of the models before evaluating them."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
|
||||||
"model_id": "87277cb39a1a4726bb7cc533a24e2ea4",
|
|
||||||
"version_major": 2,
|
|
||||||
"version_minor": 0
|
|
||||||
},
|
|
||||||
"text/plain": [
|
|
||||||
" 0%| | 0/20 [00:00<?, ?it/s]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"from tqdm.notebook import tqdm\n",
|
|
||||||
"import asyncio\n",
|
|
||||||
"\n",
|
|
||||||
"results = []\n",
|
|
||||||
"agents = [functions_agent, conversations_agent]\n",
|
|
||||||
"concurrency_level = 6 # How many concurrent agents to run. May need to decrease if OpenAI is rate limiting.\n",
|
|
||||||
"\n",
|
|
||||||
"# We will only run the first 20 examples of this dataset to speed things up\n",
|
|
||||||
"# This will lead to larger confidence intervals downstream.\n",
|
|
||||||
"batch = []\n",
|
|
||||||
"for example in tqdm(dataset[:20]):\n",
|
|
||||||
" batch.extend([agent.acall(example[\"inputs\"]) for agent in agents])\n",
|
|
||||||
" if len(batch) >= concurrency_level:\n",
|
|
||||||
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
|
|
||||||
" results.extend(list(zip(*[iter(batch_results)] * 2)))\n",
|
|
||||||
" batch = []\n",
|
|
||||||
"if batch:\n",
|
|
||||||
" batch_results = await asyncio.gather(*batch, return_exceptions=True)\n",
|
|
||||||
" results.extend(list(zip(*[iter(batch_results)] * 2)))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"## Step 5. Evaluate Pairs\n",
|
|
||||||
"\n",
|
|
||||||
"Now it's time to evaluate the results. For each agent response, run the evaluation chain to select which output is preferred (or return a tie).\n",
|
|
||||||
"\n",
|
|
||||||
"Randomly select the input order to reduce the likelihood that one model will be preferred just because it is presented first."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import random\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def predict_preferences(dataset, results) -> list:\n",
|
|
||||||
" preferences = []\n",
|
|
||||||
"\n",
|
|
||||||
" for example, (res_a, res_b) in zip(dataset, results):\n",
|
|
||||||
" input_ = example[\"inputs\"]\n",
|
|
||||||
" # Flip a coin to reduce persistent position bias\n",
|
|
||||||
" if random.random() < 0.5:\n",
|
|
||||||
" pred_a, pred_b = res_a, res_b\n",
|
|
||||||
" a, b = \"a\", \"b\"\n",
|
|
||||||
" else:\n",
|
|
||||||
" pred_a, pred_b = res_b, res_a\n",
|
|
||||||
" a, b = \"b\", \"a\"\n",
|
|
||||||
" eval_res = eval_chain.evaluate_string_pairs(\n",
|
|
||||||
" prediction=pred_a[\"output\"] if isinstance(pred_a, dict) else str(pred_a),\n",
|
|
||||||
" prediction_b=pred_b[\"output\"] if isinstance(pred_b, dict) else str(pred_b),\n",
|
|
||||||
" input=input_,\n",
|
|
||||||
" )\n",
|
|
||||||
" if eval_res[\"value\"] == \"A\":\n",
|
|
||||||
" preferences.append(a)\n",
|
|
||||||
" elif eval_res[\"value\"] == \"B\":\n",
|
|
||||||
" preferences.append(b)\n",
|
|
||||||
" else:\n",
|
|
||||||
" preferences.append(None) # No preference\n",
|
|
||||||
" return preferences"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"preferences = predict_preferences(dataset, results)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"source": [
|
|
||||||
"**Print out the ratio of preferences.**"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"OpenAI Functions Agent: 95.00%\n",
|
|
||||||
"None: 5.00%\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"from collections import Counter\n",
|
|
||||||
"\n",
|
|
||||||
"name_map = {\n",
|
|
||||||
" \"a\": \"OpenAI Functions Agent\",\n",
|
|
||||||
" \"b\": \"Structured Chat Agent\",\n",
|
|
||||||
"}\n",
|
|
||||||
"counts = Counter(preferences)\n",
|
|
||||||
"pref_ratios = {k: v / len(preferences) for k, v in counts.items()}\n",
|
|
||||||
"for k, v in pref_ratios.items():\n",
|
|
||||||
" print(f\"{name_map.get(k)}: {v:.2%}\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"### Estimate Confidence Intervals\n",
|
|
||||||
"\n",
|
|
||||||
"The results seem pretty clear, but if you want to have a better sense of how confident we are, that model \"A\" (the OpenAI Functions Agent) is the preferred model, we can calculate confidence intervals. \n",
|
|
||||||
"\n",
|
|
||||||
"Below, use the Wilson score to estimate the confidence interval."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 9,
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from math import sqrt\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def wilson_score_interval(\n",
|
|
||||||
" preferences: list, which: str = \"a\", z: float = 1.96\n",
|
|
||||||
") -> tuple:\n",
|
|
||||||
" \"\"\"Estimate the confidence interval using the Wilson score.\n",
|
|
||||||
"\n",
|
|
||||||
" See: https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval\n",
|
|
||||||
" for more details, including when to use it and when it should not be used.\n",
|
|
||||||
" \"\"\"\n",
|
|
||||||
" total_preferences = preferences.count(\"a\") + preferences.count(\"b\")\n",
|
|
||||||
" n_s = preferences.count(which)\n",
|
|
||||||
"\n",
|
|
||||||
" if total_preferences == 0:\n",
|
|
||||||
" return (0, 0)\n",
|
|
||||||
"\n",
|
|
||||||
" p_hat = n_s / total_preferences\n",
|
|
||||||
"\n",
|
|
||||||
" denominator = 1 + (z**2) / total_preferences\n",
|
|
||||||
" adjustment = (z / denominator) * sqrt(\n",
|
|
||||||
" p_hat * (1 - p_hat) / total_preferences\n",
|
|
||||||
" + (z**2) / (4 * total_preferences * total_preferences)\n",
|
|
||||||
" )\n",
|
|
||||||
" center = (p_hat + (z**2) / (2 * total_preferences)) / denominator\n",
|
|
||||||
" lower_bound = min(max(center - adjustment, 0.0), 1.0)\n",
|
|
||||||
" upper_bound = min(max(center + adjustment, 0.0), 1.0)\n",
|
|
||||||
"\n",
|
|
||||||
" return (lower_bound, upper_bound)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 10,
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"The \"OpenAI Functions Agent\" would be preferred between 83.18% and 100.00% percent of the time (with 95% confidence).\n",
|
|
||||||
"The \"Structured Chat Agent\" would be preferred between 0.00% and 16.82% percent of the time (with 95% confidence).\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"for which_, name in name_map.items():\n",
|
|
||||||
" low, high = wilson_score_interval(preferences, which=which_)\n",
|
|
||||||
" print(\n",
|
|
||||||
" f'The \"{name}\" would be preferred between {low:.2%} and {high:.2%} percent of the time (with 95% confidence).'\n",
|
|
||||||
" )"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"**Print out the p-value.**"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 11,
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"The p-value is 0.00000. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
|
|
||||||
"then there is a 0.00038% chance of observing the OpenAI Functions Agent be preferred at least 19\n",
|
|
||||||
"times out of 19 trials.\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/var/folders/gf/6rnp_mbx5914kx7qmmh7xzmw0000gn/T/ipykernel_15978/384907688.py:6: DeprecationWarning: 'binom_test' is deprecated in favour of 'binomtest' from version 1.7.0 and will be removed in Scipy 1.12.0.\n",
|
|
||||||
" p_value = stats.binom_test(successes, n, p=0.5, alternative=\"two-sided\")\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"from scipy import stats\n",
|
|
||||||
"\n",
|
|
||||||
"preferred_model = max(pref_ratios, key=pref_ratios.get)\n",
|
|
||||||
"successes = preferences.count(preferred_model)\n",
|
|
||||||
"n = len(preferences) - preferences.count(None)\n",
|
|
||||||
"p_value = stats.binom_test(successes, n, p=0.5, alternative=\"two-sided\")\n",
|
|
||||||
"print(\n",
|
|
||||||
" f\"\"\"The p-value is {p_value:.5f}. If the null hypothesis is true (i.e., if the selected eval chain actually has no preference between the models),\n",
|
|
||||||
"then there is a {p_value:.5%} chance of observing the {name_map.get(preferred_model)} be preferred at least {successes}\n",
|
|
||||||
"times out of {n} trials.\"\"\"\n",
|
|
||||||
")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"<a name=\"cite_note-1\"></a>_1. Note: Automated evals are still an open research topic and are best used alongside other evaluation approaches. \n",
|
|
||||||
"LLM preferences exhibit biases, including banal ones like the order of outputs.\n",
|
|
||||||
"In choosing preferences, \"ground truth\" may not be taken into account, which may lead to scores that aren't grounded in utility._"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.11.2"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 4
|
|
||||||
}
|
}
|
@ -1,318 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "bce7335e-f3b2-44f3-90cc-8c0a23a89a21",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"from langchain.agents import load_tools\n",
|
|
||||||
"from langchain.agents import initialize_agent\n",
|
|
||||||
"from langchain.chat_models import ChatOpenAI\n",
|
|
||||||
"from langchain.utilities import GoogleSearchAPIWrapper\n",
|
|
||||||
"from langchain.schema import (\n",
|
|
||||||
" SystemMessage,\n",
|
|
||||||
" HumanMessage,\n",
|
|
||||||
" AIMessage\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"# os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
|
|
||||||
"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.smith.langchain.com\"\n",
|
|
||||||
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"******\"\n",
|
|
||||||
"# os.environ[\"LANGCHAIN_PROJECT\"] = \"Jarvis\"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"prefix_messages = [{\"role\": \"system\", \"content\": \"You are a helpful discord Chatbot.\"}]\n",
|
|
||||||
"\n",
|
|
||||||
"llm = ChatOpenAI(model_name='gpt-3.5-turbo', \n",
|
|
||||||
" temperature=0.5, \n",
|
|
||||||
" max_tokens = 2000)\n",
|
|
||||||
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
|
|
||||||
"agent = initialize_agent(tools,\n",
|
|
||||||
" llm,\n",
|
|
||||||
" agent=\"zero-shot-react-description\",\n",
|
|
||||||
" verbose=True,\n",
|
|
||||||
" handle_parsing_errors=True\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"async def on_ready():\n",
|
|
||||||
" print(f'{bot.user} has connected to Discord!')\n",
|
|
||||||
"\n",
|
|
||||||
"async def on_message(message):\n",
|
|
||||||
"\n",
|
|
||||||
" print(\"Detected bot name in message:\", message.content)\n",
|
|
||||||
"\n",
|
|
||||||
" # Capture the output of agent.run() in the response variable\n",
|
|
||||||
" response = agent.run(message.content)\n",
|
|
||||||
"\n",
|
|
||||||
" while response:\n",
|
|
||||||
" print(response)\n",
|
|
||||||
" chunk, response = response[:2000], response[2000:]\n",
|
|
||||||
" print(f\"Chunk: {chunk}\")\n",
|
|
||||||
" print(\"Response sent.\")\n",
|
|
||||||
"\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 22,
|
|
||||||
"id": "1551ce9f-b6de-4035-b6d6-825722823b48",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from dataclasses import dataclass\n",
|
|
||||||
"@dataclass\n",
|
|
||||||
"class Message:\n",
|
|
||||||
" content: str"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 23,
|
|
||||||
"id": "6e6859ec-8544-4407-9663-6b53c0092903",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Detected bot name in message: Hi AI, how are you today?\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
|
||||||
"\u001b[32;1m\u001b[1;3mThis question is not something that can be answered using the available tools.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3mI need to follow the correct format for answering questions.\n",
|
|
||||||
"Action: N/A\u001b[0m\n",
|
|
||||||
"Observation: Invalid Format: Missing 'Action Input:' after 'Action:'\n",
|
|
||||||
"Thought:\u001b[32;1m\u001b[1;3m\u001b[0m\n",
|
|
||||||
"\n",
|
|
||||||
"\u001b[1m> Finished chain.\u001b[0m\n",
|
|
||||||
"Agent stopped due to iteration limit or time limit.\n",
|
|
||||||
"Chunk: Agent stopped due to iteration limit or time limit.\n",
|
|
||||||
"Response sent.\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"await on_message(Message(content=\"Hi AI, how are you today?\"))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 24,
|
|
||||||
"id": "b850294c-7f8f-4e79-adcf-47e4e3a898df",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langsmith import Client\n",
|
|
||||||
"\n",
|
|
||||||
"client = Client()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 25,
|
|
||||||
"id": "6d089ddc-69bc-45a8-b8db-9962e4f1f5ee",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from itertools import islice\n",
|
|
||||||
"\n",
|
|
||||||
"runs = list(islice(client.list_runs(), 10))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 38,
|
|
||||||
"id": "f0349fac-5a98-400f-ba03-61ed4e1332be",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"runs = sorted(runs, key=lambda x: x.start_time, reverse=True)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 26,
|
|
||||||
"id": "02f133f0-39ee-4b46-b443-12c1f9b76fff",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"ids = [run.id for run in runs]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 39,
|
|
||||||
"id": "3366dce4-0c38-4a7d-8111-046a58b24917",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"runs2 = list(client.list_runs(id=ids))\n",
|
|
||||||
"runs2 = sorted(runs2, key=lambda x: x.start_time, reverse=True)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 42,
|
|
||||||
"id": "82915b90-39a0-47d6-9121-56a13f210f52",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['a36092d2-4ad5-4fb4-9b0d-0dba9a2ed836',\n",
|
|
||||||
" '9398e6be-964f-4aa4-8de9-ad78cd4b7074']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 42,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"[str(x) for x in ids[:2]]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 48,
|
|
||||||
"id": "f610ec91-dc48-4a17-91c5-5c4675c77abc",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langsmith.run_helpers import traceable\n",
|
|
||||||
"\n",
|
|
||||||
"@traceable(run_type=\"llm\", name=\"\"\"<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/dQw4w9WgXcQ?start=5\" title=\"YouTube video player\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" allowfullscreen></iframe>\"\"\")\n",
|
|
||||||
"def foo():\n",
|
|
||||||
" return \"bar\""
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 49,
|
|
||||||
"id": "bd317bd7-8b2a-433a-8ec3-098a84ba8e64",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"'bar'"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 49,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"foo()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 52,
|
|
||||||
"id": "b142519b-6885-415c-83b9-4a346fb90589",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain.llms import AzureOpenAI"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "5c50bb2b-72b8-4322-9b16-d857ecd9f347",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.11.2"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
@ -1,208 +1,209 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "4460f924-1738-4dc5-999f-c26383aba0a4",
|
"id": "4460f924-1738-4dc5-999f-c26383aba0a4",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Custom String Evaluator\n",
|
"# Custom String Evaluator\n",
|
||||||
"\n",
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/custom.ipynb)\n",
|
||||||
"You can make your own custom string evaluators by inheriting from the `StringEvaluator` class and implementing the `_evaluate_strings` (and `_aevaluate_strings` for async support) methods.\n",
|
"\n",
|
||||||
"\n",
|
"You can make your own custom string evaluators by inheriting from the `StringEvaluator` class and implementing the `_evaluate_strings` (and `_aevaluate_strings` for async support) methods.\n",
|
||||||
"In this example, you will create a perplexity evaluator using the HuggingFace [evaluate](https://huggingface.co/docs/evaluate/index) library.\n",
|
"\n",
|
||||||
"[Perplexity](https://en.wikipedia.org/wiki/Perplexity) is a measure of how well the generated text would be predicted by the model used to compute the metric."
|
"In this example, you will create a perplexity evaluator using the HuggingFace [evaluate](https://huggingface.co/docs/evaluate/index) library.\n",
|
||||||
]
|
"[Perplexity](https://en.wikipedia.org/wiki/Perplexity) is a measure of how well the generated text would be predicted by the model used to compute the metric."
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 1,
|
"cell_type": "code",
|
||||||
"id": "90ec5942-4b14-47b1-baff-9dd2a9f17a4e",
|
"execution_count": 1,
|
||||||
"metadata": {
|
"id": "90ec5942-4b14-47b1-baff-9dd2a9f17a4e",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"# %pip install evaluate > /dev/null"
|
"source": [
|
||||||
]
|
"# %pip install evaluate > /dev/null"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 2,
|
"cell_type": "code",
|
||||||
"id": "54fdba68-0ae7-4102-a45b-dabab86c97ac",
|
"execution_count": 2,
|
||||||
"metadata": {
|
"id": "54fdba68-0ae7-4102-a45b-dabab86c97ac",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"from typing import Any, Optional\n",
|
"source": [
|
||||||
"\n",
|
"from typing import Any, Optional\n",
|
||||||
"from langchain.evaluation import StringEvaluator\n",
|
"\n",
|
||||||
"from evaluate import load\n",
|
"from langchain.evaluation import StringEvaluator\n",
|
||||||
"\n",
|
"from evaluate import load\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class PerplexityEvaluator(StringEvaluator):\n",
|
"\n",
|
||||||
" \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
|
"class PerplexityEvaluator(StringEvaluator):\n",
|
||||||
"\n",
|
" \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
|
||||||
" def __init__(self, model_id: str = \"gpt2\"):\n",
|
"\n",
|
||||||
" self.model_id = model_id\n",
|
" def __init__(self, model_id: str = \"gpt2\"):\n",
|
||||||
" self.metric_fn = load(\n",
|
" self.model_id = model_id\n",
|
||||||
" \"perplexity\", module_type=\"metric\", model_id=self.model_id, pad_token=0\n",
|
" self.metric_fn = load(\n",
|
||||||
" )\n",
|
" \"perplexity\", module_type=\"metric\", model_id=self.model_id, pad_token=0\n",
|
||||||
"\n",
|
" )\n",
|
||||||
" def _evaluate_strings(\n",
|
"\n",
|
||||||
" self,\n",
|
" def _evaluate_strings(\n",
|
||||||
" *,\n",
|
" self,\n",
|
||||||
" prediction: str,\n",
|
" *,\n",
|
||||||
" reference: Optional[str] = None,\n",
|
" prediction: str,\n",
|
||||||
" input: Optional[str] = None,\n",
|
" reference: Optional[str] = None,\n",
|
||||||
" **kwargs: Any,\n",
|
" input: Optional[str] = None,\n",
|
||||||
" ) -> dict:\n",
|
" **kwargs: Any,\n",
|
||||||
" results = self.metric_fn.compute(\n",
|
" ) -> dict:\n",
|
||||||
" predictions=[prediction], model_id=self.model_id\n",
|
" results = self.metric_fn.compute(\n",
|
||||||
" )\n",
|
" predictions=[prediction], model_id=self.model_id\n",
|
||||||
" ppl = results[\"perplexities\"][0]\n",
|
" )\n",
|
||||||
" return {\"score\": ppl}"
|
" ppl = results[\"perplexities\"][0]\n",
|
||||||
]
|
" return {\"score\": ppl}"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 3,
|
"cell_type": "code",
|
||||||
"id": "52767568-8075-4f77-93c9-80e1a7e5cba3",
|
"execution_count": 3,
|
||||||
"metadata": {
|
"id": "52767568-8075-4f77-93c9-80e1a7e5cba3",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"evaluator = PerplexityEvaluator()"
|
"source": [
|
||||||
]
|
"evaluator = PerplexityEvaluator()"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 4,
|
"cell_type": "code",
|
||||||
"id": "697ee0c0-d1ae-4a55-a542-a0f8e602c28a",
|
"execution_count": 4,
|
||||||
"metadata": {
|
"id": "697ee0c0-d1ae-4a55-a542-a0f8e602c28a",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"name": "stderr",
|
{
|
||||||
"output_type": "stream",
|
"name": "stderr",
|
||||||
"text": [
|
"output_type": "stream",
|
||||||
"Using pad_token, but it is not set yet.\n"
|
"text": [
|
||||||
]
|
"Using pad_token, but it is not set yet.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
||||||
|
"To disable this warning, you can either:\n",
|
||||||
|
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
||||||
|
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "467109d44654486e8b415288a319fc2c",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 190.3675537109375}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on the plain.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "5089d9d1-eae6-4d47-b4f6-479e5d887d74",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Using pad_token, but it is not set yet.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "d3266f6f06d746e1bb03ce4aca07d9b9",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/plain": [
|
||||||
|
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 1982.0709228515625}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# The perplexity is much higher since LangChain was introduced after 'gpt-2' was released and because it is never used in the following context.\n",
|
||||||
|
"evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on LangChain.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "5eaa178f-6ba3-47ae-b3dc-1b196af6d213",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
},
|
},
|
||||||
{
|
"nbformat": 4,
|
||||||
"name": "stdout",
|
"nbformat_minor": 5
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
|
||||||
"To disable this warning, you can either:\n",
|
|
||||||
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
|
||||||
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
|
||||||
"model_id": "467109d44654486e8b415288a319fc2c",
|
|
||||||
"version_major": 2,
|
|
||||||
"version_minor": 0
|
|
||||||
},
|
|
||||||
"text/plain": [
|
|
||||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"{'score': 190.3675537109375}"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on the plain.\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"id": "5089d9d1-eae6-4d47-b4f6-479e5d887d74",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Using pad_token, but it is not set yet.\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"application/vnd.jupyter.widget-view+json": {
|
|
||||||
"model_id": "d3266f6f06d746e1bb03ce4aca07d9b9",
|
|
||||||
"version_major": 2,
|
|
||||||
"version_minor": 0
|
|
||||||
},
|
|
||||||
"text/plain": [
|
|
||||||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "display_data"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"{'score': 1982.0709228515625}"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 6,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"# The perplexity is much higher since LangChain was introduced after 'gpt-2' was released and because it is never used in the following context.\n",
|
|
||||||
"evaluator.evaluate_strings(prediction=\"The rains in Spain fall mainly on LangChain.\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "5eaa178f-6ba3-47ae-b3dc-1b196af6d213",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.11.2"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
}
|
@ -1,223 +1,224 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"tags": []
|
"tags": []
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# Embedding Distance\n",
|
"# Embedding Distance\n",
|
||||||
"\n",
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/embedding_distance.ipynb)\n",
|
||||||
"To measure semantic similarity (or dissimilarity) between a prediction and a reference label string, you could use a vector vector distance metric the two embedded representations using the `embedding_distance` evaluator.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
|
"\n",
|
||||||
"\n",
|
"To measure semantic similarity (or dissimilarity) between a prediction and a reference label string, you could use a vector vector distance metric the two embedded representations using the `embedding_distance` evaluator.<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"**Note:** This returns a **distance** score, meaning that the lower the number, the **more** similar the prediction is to the reference, according to their embedded representation.\n",
|
"\n",
|
||||||
"\n",
|
"**Note:** This returns a **distance** score, meaning that the lower the number, the **more** similar the prediction is to the reference, according to their embedded representation.\n",
|
||||||
"Check out the reference docs for the [EmbeddingDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain.html#langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain) for more info."
|
"\n",
|
||||||
]
|
"Check out the reference docs for the [EmbeddingDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain.html#langchain.evaluation.embedding_distance.base.EmbeddingDistanceEvalChain) for more info."
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 1,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 1,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"from langchain.evaluation import load_evaluator\n",
|
"source": [
|
||||||
"\n",
|
"from langchain.evaluation import load_evaluator\n",
|
||||||
"evaluator = load_evaluator(\"embedding_distance\")"
|
"\n",
|
||||||
]
|
"evaluator = load_evaluator(\"embedding_distance\")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 2,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 2,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.0966466944859925}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.0966466944859925}"
|
||||||
},
|
]
|
||||||
"execution_count": 2,
|
},
|
||||||
"metadata": {},
|
"execution_count": 2,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
|
"source": [
|
||||||
]
|
"evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 3,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 3,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.03761174337464557}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.03761174337464557}"
|
||||||
},
|
]
|
||||||
"execution_count": 3,
|
},
|
||||||
"metadata": {},
|
"execution_count": 3,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
|
"source": [
|
||||||
]
|
"evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "markdown",
|
{
|
||||||
"metadata": {},
|
"cell_type": "markdown",
|
||||||
"source": [
|
"metadata": {},
|
||||||
"## Select the Distance Metric\n",
|
"source": [
|
||||||
"\n",
|
"## Select the Distance Metric\n",
|
||||||
"By default, the evalutor uses cosine distance. You can choose a different distance metric if you'd like. "
|
"\n",
|
||||||
]
|
"By default, the evalutor uses cosine distance. You can choose a different distance metric if you'd like. "
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 4,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 4,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"[<EmbeddingDistance.COSINE: 'cosine'>,\n",
|
"text/plain": [
|
||||||
" <EmbeddingDistance.EUCLIDEAN: 'euclidean'>,\n",
|
"[<EmbeddingDistance.COSINE: 'cosine'>,\n",
|
||||||
" <EmbeddingDistance.MANHATTAN: 'manhattan'>,\n",
|
" <EmbeddingDistance.EUCLIDEAN: 'euclidean'>,\n",
|
||||||
" <EmbeddingDistance.CHEBYSHEV: 'chebyshev'>,\n",
|
" <EmbeddingDistance.MANHATTAN: 'manhattan'>,\n",
|
||||||
" <EmbeddingDistance.HAMMING: 'hamming'>]"
|
" <EmbeddingDistance.CHEBYSHEV: 'chebyshev'>,\n",
|
||||||
]
|
" <EmbeddingDistance.HAMMING: 'hamming'>]"
|
||||||
},
|
]
|
||||||
"execution_count": 4,
|
},
|
||||||
"metadata": {},
|
"execution_count": 4,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"from langchain.evaluation import EmbeddingDistance\n",
|
"source": [
|
||||||
"\n",
|
"from langchain.evaluation import EmbeddingDistance\n",
|
||||||
"list(EmbeddingDistance)"
|
"\n",
|
||||||
]
|
"list(EmbeddingDistance)"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 5,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 5,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"# You can load by enum or by raw python string\n",
|
"source": [
|
||||||
"evaluator = load_evaluator(\n",
|
"# You can load by enum or by raw python string\n",
|
||||||
" \"embedding_distance\", distance_metric=EmbeddingDistance.EUCLIDEAN\n",
|
"evaluator = load_evaluator(\n",
|
||||||
")"
|
" \"embedding_distance\", distance_metric=EmbeddingDistance.EUCLIDEAN\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "markdown",
|
{
|
||||||
"metadata": {},
|
"cell_type": "markdown",
|
||||||
"source": [
|
"metadata": {},
|
||||||
"## Select Embeddings to Use\n",
|
"source": [
|
||||||
"\n",
|
"## Select Embeddings to Use\n",
|
||||||
"The constructor uses `OpenAI` embeddings by default, but you can configure this however you want. Below, use huggingface local embeddings"
|
"\n",
|
||||||
]
|
"The constructor uses `OpenAI` embeddings by default, but you can configure this however you want. Below, use huggingface local embeddings"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 6,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 6,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
"source": [
|
||||||
"\n",
|
"from langchain.embeddings import HuggingFaceEmbeddings\n",
|
||||||
"embedding_model = HuggingFaceEmbeddings()\n",
|
"\n",
|
||||||
"hf_evaluator = load_evaluator(\"embedding_distance\", embeddings=embedding_model)"
|
"embedding_model = HuggingFaceEmbeddings()\n",
|
||||||
]
|
"hf_evaluator = load_evaluator(\"embedding_distance\", embeddings=embedding_model)"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 7,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 7,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.5486443280477362}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.5486443280477362}"
|
||||||
},
|
]
|
||||||
"execution_count": 7,
|
},
|
||||||
"metadata": {},
|
"execution_count": 7,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
|
"source": [
|
||||||
]
|
"hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I shan't go\")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 8,
|
"cell_type": "code",
|
||||||
"metadata": {
|
"execution_count": 8,
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.21018880025138598}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.21018880025138598}"
|
||||||
},
|
]
|
||||||
"execution_count": 8,
|
},
|
||||||
"metadata": {},
|
"execution_count": 8,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
|
"source": [
|
||||||
]
|
"hf_evaluator.evaluate_strings(prediction=\"I shall go\", reference=\"I will go\")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "markdown",
|
{
|
||||||
"metadata": {},
|
"cell_type": "markdown",
|
||||||
"source": [
|
"metadata": {},
|
||||||
"<a name=\"cite_note-1\"></a><i>1. Note: When it comes to semantic similarity, this often gives better results than older string distance metrics (such as those in the [StringDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain)), though it tends to be less reliable than evaluators that use the LLM directly (such as the [QAEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.QAEvalChain.html#langchain.evaluation.qa.eval_chain.QAEvalChain) or [LabeledCriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain)) </i>"
|
"source": [
|
||||||
]
|
"<a name=\"cite_note-1\"></a><i>1. Note: When it comes to semantic similarity, this often gives better results than older string distance metrics (such as those in the [StringDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain)), though it tends to be less reliable than evaluators that use the LLM directly (such as the [QAEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.qa.eval_chain.QAEvalChain.html#langchain.evaluation.qa.eval_chain.QAEvalChain) or [LabeledCriteriaEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain.html#langchain.evaluation.criteria.eval_chain.LabeledCriteriaEvalChain)) </i>"
|
||||||
}
|
]
|
||||||
],
|
}
|
||||||
"metadata": {
|
],
|
||||||
"kernelspec": {
|
"metadata": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"kernelspec": {
|
||||||
"language": "python",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"name": "python3"
|
"language": "python",
|
||||||
},
|
"name": "python3"
|
||||||
"language_info": {
|
},
|
||||||
"codemirror_mode": {
|
"language_info": {
|
||||||
"name": "ipython",
|
"codemirror_mode": {
|
||||||
"version": 3
|
"name": "ipython",
|
||||||
},
|
"version": 3
|
||||||
"file_extension": ".py",
|
},
|
||||||
"mimetype": "text/x-python",
|
"file_extension": ".py",
|
||||||
"name": "python",
|
"mimetype": "text/x-python",
|
||||||
"nbconvert_exporter": "python",
|
"name": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"nbconvert_exporter": "python",
|
||||||
"version": "3.11.2"
|
"pygments_lexer": "ipython3",
|
||||||
}
|
"version": "3.11.2"
|
||||||
},
|
}
|
||||||
"nbformat": 4,
|
},
|
||||||
"nbformat_minor": 4
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
}
|
}
|
@ -0,0 +1,175 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "2da95378",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Exact Match\n",
|
||||||
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/exact_match.ipynb)\n",
|
||||||
|
"\n",
|
||||||
|
"Probably the simplest ways to evaluate an LLM or runnable's string output against a reference label is by a simple string equivalence.\n",
|
||||||
|
"\n",
|
||||||
|
"This can be accessed using the `exact_match` evaluator."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "0de44d01-1fea-4701-b941-c4fb74e521e7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.evaluation import ExactMatchStringEvaluator\n",
|
||||||
|
"\n",
|
||||||
|
"evaluator = ExactMatchStringEvaluator()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "fe3baf5f-bfee-4745-bcd6-1a9b422ed46f",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Alternatively via the loader:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "f6790c46",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.evaluation import load_evaluator\n",
|
||||||
|
"\n",
|
||||||
|
"evaluator = load_evaluator(\"exact_match\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "49ad9139",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 0}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"1 LLM.\",\n",
|
||||||
|
" reference=\"2 llm\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "1f5e82a3-247e-45a8-85fc-6af53bf7ff82",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 0}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"LangChain\",\n",
|
||||||
|
" reference=\"langchain\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Configure the ExactMatchStringEvaluator\n",
|
||||||
|
"\n",
|
||||||
|
"You can relax the \"exactness\" when comparing strings."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"evaluator = ExactMatchStringEvaluator(\n",
|
||||||
|
" ignore_case=True,\n",
|
||||||
|
" ignore_numbers=True,\n",
|
||||||
|
" ignore_punctuation=True,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Alternatively\n",
|
||||||
|
"# evaluator = load_evaluator(\"exact_match\", ignore_case=True, ignore_numbers=True, ignore_punctuation=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"1 LLM.\",\n",
|
||||||
|
" reference=\"2 llm\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,243 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "2da95378",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Regex Match\n",
|
||||||
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/regex_match.ipynb)\n",
|
||||||
|
"\n",
|
||||||
|
"To evaluate chain or runnable string predictions against a custom regex, you can use the `regex_match` evaluator."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "0de44d01-1fea-4701-b941-c4fb74e521e7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.evaluation import RegexMatchStringEvaluator\n",
|
||||||
|
"\n",
|
||||||
|
"evaluator = RegexMatchStringEvaluator()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "fe3baf5f-bfee-4745-bcd6-1a9b422ed46f",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Alternatively via the loader:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "f6790c46",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.evaluation import load_evaluator\n",
|
||||||
|
"\n",
|
||||||
|
"evaluator = load_evaluator(\"regex_match\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "49ad9139",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Check for the presence of a YYYY-MM-DD string.\n",
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"The delivery will be made on 2024-01-05\",\n",
|
||||||
|
" reference=\".*\\\\b\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\b.*\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "1f5e82a3-247e-45a8-85fc-6af53bf7ff82",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 0}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Check for the presence of a MM-DD-YYYY string.\n",
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"The delivery will be made on 2024-01-05\",\n",
|
||||||
|
" reference=\".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "168fcd92-dffb-4345-b097-02d0fedf52fd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Check for the presence of a MM-DD-YYYY string.\n",
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"The delivery will be made on 01-05-2024\",\n",
|
||||||
|
" reference=\".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1d82dab5-6a49-4fe7-b3fb-8bcfb27d26e0",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Match against multiple patterns\n",
|
||||||
|
"\n",
|
||||||
|
"To match against multiple patterns, use a regex union \"|\"."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "b87b915e-b7c2-476b-a452-99688a22293a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Check for the presence of a MM-DD-YYYY string or YYYY-MM-DD\n",
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"The delivery will be made on 01-05-2024\",\n",
|
||||||
|
" reference=\"|\".join([\".*\\\\b\\\\d{4}-\\\\d{2}-\\\\d{2}\\\\b.*\", \".*\\\\b\\\\d{2}-\\\\d{2}-\\\\d{4}\\\\b.*\"])\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Configure the RegexMatchStringEvaluator\n",
|
||||||
|
"\n",
|
||||||
|
"You can specify any regex flags to use when matching."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"\n",
|
||||||
|
"evaluator = RegexMatchStringEvaluator(\n",
|
||||||
|
" flags=re.IGNORECASE\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# Alternatively\n",
|
||||||
|
"# evaluator = load_evaluator(\"exact_match\", flags=re.IGNORECASE)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'score': 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"evaluator.evaluate_strings(\n",
|
||||||
|
" prediction=\"I LOVE testing\",\n",
|
||||||
|
" reference=\"I love testing\",\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "82de8d3e-c829-440e-a582-3fb70cecad3b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -1,222 +1,223 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "2da95378",
|
"id": "2da95378",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# String Distance\n",
|
"# String Distance\n",
|
||||||
"\n",
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/string/string_distance.ipynb)\n",
|
||||||
"One of the simplest ways to compare an LLM or chain's string output against a reference label is by using string distance measurements such as Levenshtein or postfix distance. This can be used alongside approximate/fuzzy matching criteria for very basic unit testing.\n",
|
"\n",
|
||||||
"\n",
|
"One of the simplest ways to compare an LLM or chain's string output against a reference label is by using string distance measurements such as Levenshtein or postfix distance. This can be used alongside approximate/fuzzy matching criteria for very basic unit testing.\n",
|
||||||
"This can be accessed using the `string_distance` evaluator, which uses distance metric's from the [rapidfuzz](https://github.com/maxbachmann/RapidFuzz) library.\n",
|
"\n",
|
||||||
"\n",
|
"This can be accessed using the `string_distance` evaluator, which uses distance metric's from the [rapidfuzz](https://github.com/maxbachmann/RapidFuzz) library.\n",
|
||||||
"**Note:** The returned scores are _distances_, meaning lower is typically \"better\".\n",
|
"\n",
|
||||||
"\n",
|
"**Note:** The returned scores are _distances_, meaning lower is typically \"better\".\n",
|
||||||
"For more information, check out the reference docs for the [StringDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain) for more info."
|
"\n",
|
||||||
]
|
"For more information, check out the reference docs for the [StringDistanceEvalChain](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.string_distance.base.StringDistanceEvalChain.html#langchain.evaluation.string_distance.base.StringDistanceEvalChain) for more info."
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 1,
|
"cell_type": "code",
|
||||||
"id": "8b47b909-3251-4774-9a7d-e436da4f8979",
|
"execution_count": 1,
|
||||||
"metadata": {
|
"id": "8b47b909-3251-4774-9a7d-e436da4f8979",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"# %pip install rapidfuzz"
|
"source": [
|
||||||
]
|
"# %pip install rapidfuzz"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 2,
|
"cell_type": "code",
|
||||||
"id": "f6790c46",
|
"execution_count": 2,
|
||||||
"metadata": {
|
"id": "f6790c46",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"from langchain.evaluation import load_evaluator\n",
|
"source": [
|
||||||
"\n",
|
"from langchain.evaluation import load_evaluator\n",
|
||||||
"evaluator = load_evaluator(\"string_distance\")"
|
"\n",
|
||||||
]
|
"evaluator = load_evaluator(\"string_distance\")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 3,
|
"cell_type": "code",
|
||||||
"id": "49ad9139",
|
"execution_count": 3,
|
||||||
"metadata": {
|
"id": "49ad9139",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.11555555555555552}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.11555555555555552}"
|
||||||
},
|
]
|
||||||
"execution_count": 3,
|
},
|
||||||
"metadata": {},
|
"execution_count": 3,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"evaluator.evaluate_strings(\n",
|
"source": [
|
||||||
" prediction=\"The job is completely done.\",\n",
|
"evaluator.evaluate_strings(\n",
|
||||||
" reference=\"The job is done\",\n",
|
" prediction=\"The job is completely done.\",\n",
|
||||||
")"
|
" reference=\"The job is done\",\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 4,
|
"cell_type": "code",
|
||||||
"id": "c06a2296",
|
"execution_count": 4,
|
||||||
"metadata": {
|
"id": "c06a2296",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.0724999999999999}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.0724999999999999}"
|
||||||
},
|
]
|
||||||
"execution_count": 4,
|
},
|
||||||
"metadata": {},
|
"execution_count": 4,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"# The results purely character-based, so it's less useful when negation is concerned\n",
|
"source": [
|
||||||
"evaluator.evaluate_strings(\n",
|
"# The results purely character-based, so it's less useful when negation is concerned\n",
|
||||||
" prediction=\"The job is done.\",\n",
|
"evaluator.evaluate_strings(\n",
|
||||||
" reference=\"The job isn't done\",\n",
|
" prediction=\"The job is done.\",\n",
|
||||||
")"
|
" reference=\"The job isn't done\",\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "markdown",
|
{
|
||||||
"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"id": "b8ed1f12-09a6-4e90-a69d-c8df525ff293",
|
||||||
"source": [
|
"metadata": {},
|
||||||
"## Configure the String Distance Metric\n",
|
"source": [
|
||||||
"\n",
|
"## Configure the String Distance Metric\n",
|
||||||
"By default, the `StringDistanceEvalChain` uses levenshtein distance, but it also supports other string distance algorithms. Configure using the `distance` argument."
|
"\n",
|
||||||
]
|
"By default, the `StringDistanceEvalChain` uses levenshtein distance, but it also supports other string distance algorithms. Configure using the `distance` argument."
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 5,
|
"cell_type": "code",
|
||||||
"id": "a88bc7d7-62d3-408d-b0e0-43abcecf35c8",
|
"execution_count": 5,
|
||||||
"metadata": {
|
"id": "a88bc7d7-62d3-408d-b0e0-43abcecf35c8",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"[<StringDistance.DAMERAU_LEVENSHTEIN: 'damerau_levenshtein'>,\n",
|
"text/plain": [
|
||||||
" <StringDistance.LEVENSHTEIN: 'levenshtein'>,\n",
|
"[<StringDistance.DAMERAU_LEVENSHTEIN: 'damerau_levenshtein'>,\n",
|
||||||
" <StringDistance.JARO: 'jaro'>,\n",
|
" <StringDistance.LEVENSHTEIN: 'levenshtein'>,\n",
|
||||||
" <StringDistance.JARO_WINKLER: 'jaro_winkler'>]"
|
" <StringDistance.JARO: 'jaro'>,\n",
|
||||||
]
|
" <StringDistance.JARO_WINKLER: 'jaro_winkler'>]"
|
||||||
},
|
]
|
||||||
"execution_count": 5,
|
},
|
||||||
"metadata": {},
|
"execution_count": 5,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"from langchain.evaluation import StringDistance\n",
|
"source": [
|
||||||
"\n",
|
"from langchain.evaluation import StringDistance\n",
|
||||||
"list(StringDistance)"
|
"\n",
|
||||||
]
|
"list(StringDistance)"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 6,
|
"cell_type": "code",
|
||||||
"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
|
"execution_count": 6,
|
||||||
"metadata": {
|
"id": "0c079864-0175-4d06-9d3f-a0e51dd3977c",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [],
|
},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"jaro_evaluator = load_evaluator(\n",
|
"source": [
|
||||||
" \"string_distance\", distance=StringDistance.JARO\n",
|
"jaro_evaluator = load_evaluator(\n",
|
||||||
")"
|
" \"string_distance\", distance=StringDistance.JARO\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 7,
|
"cell_type": "code",
|
||||||
"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
|
"execution_count": 7,
|
||||||
"metadata": {},
|
"id": "a8dfb900-14f3-4a1f-8736-dd1d86a1264c",
|
||||||
"outputs": [
|
"metadata": {},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.19259259259259254}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.19259259259259254}"
|
||||||
},
|
]
|
||||||
"execution_count": 7,
|
},
|
||||||
"metadata": {},
|
"execution_count": 7,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"jaro_evaluator.evaluate_strings(\n",
|
"source": [
|
||||||
" prediction=\"The job is completely done.\",\n",
|
"jaro_evaluator.evaluate_strings(\n",
|
||||||
" reference=\"The job is done\",\n",
|
" prediction=\"The job is completely done.\",\n",
|
||||||
")"
|
" reference=\"The job is done\",\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 8,
|
"cell_type": "code",
|
||||||
"id": "7020b046-0ef7-40cc-8778-b928e35f3ce1",
|
"execution_count": 8,
|
||||||
"metadata": {
|
"id": "7020b046-0ef7-40cc-8778-b928e35f3ce1",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 0.12083333333333324}"
|
"text/plain": [
|
||||||
]
|
"{'score': 0.12083333333333324}"
|
||||||
},
|
]
|
||||||
"execution_count": 8,
|
},
|
||||||
"metadata": {},
|
"execution_count": 8,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"jaro_evaluator.evaluate_strings(\n",
|
"source": [
|
||||||
" prediction=\"The job is done.\",\n",
|
"jaro_evaluator.evaluate_strings(\n",
|
||||||
" reference=\"The job isn't done\",\n",
|
" prediction=\"The job is done.\",\n",
|
||||||
")"
|
" reference=\"The job isn't done\",\n",
|
||||||
]
|
")"
|
||||||
}
|
]
|
||||||
],
|
}
|
||||||
"metadata": {
|
],
|
||||||
"kernelspec": {
|
"metadata": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"kernelspec": {
|
||||||
"language": "python",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"name": "python3"
|
"language": "python",
|
||||||
},
|
"name": "python3"
|
||||||
"language_info": {
|
},
|
||||||
"codemirror_mode": {
|
"language_info": {
|
||||||
"name": "ipython",
|
"codemirror_mode": {
|
||||||
"version": 3
|
"name": "ipython",
|
||||||
},
|
"version": 3
|
||||||
"file_extension": ".py",
|
},
|
||||||
"mimetype": "text/x-python",
|
"file_extension": ".py",
|
||||||
"name": "python",
|
"mimetype": "text/x-python",
|
||||||
"nbconvert_exporter": "python",
|
"name": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"nbconvert_exporter": "python",
|
||||||
"version": "3.11.2"
|
"pygments_lexer": "ipython3",
|
||||||
}
|
"version": "3.11.2"
|
||||||
},
|
}
|
||||||
"nbformat": 4,
|
},
|
||||||
"nbformat_minor": 5
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
}
|
}
|
@ -1,141 +1,142 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "db9d627f-b234-4f7f-ab96-639fae474122",
|
"id": "db9d627f-b234-4f7f-ab96-639fae474122",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"# Custom Trajectory Evaluator\n",
|
"# Custom Trajectory Evaluator\n",
|
||||||
"\n",
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/evaluation/trajectory/custom.ipynb)\n",
|
||||||
"You can make your own custom trajectory evaluators by inheriting from the [AgentTrajectoryEvaluator](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.AgentTrajectoryEvaluator.html#langchain.evaluation.schema.AgentTrajectoryEvaluator) class and overwriting the `_evaluate_agent_trajectory` (and `_aevaluate_agent_action`) method.\n",
|
"\n",
|
||||||
"\n",
|
"You can make your own custom trajectory evaluators by inheriting from the [AgentTrajectoryEvaluator](https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.schema.AgentTrajectoryEvaluator.html#langchain.evaluation.schema.AgentTrajectoryEvaluator) class and overwriting the `_evaluate_agent_trajectory` (and `_aevaluate_agent_action`) method.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"In this example, you will make a simple trajectory evaluator that uses an LLM to determine if any actions were unnecessary."
|
"\n",
|
||||||
]
|
"In this example, you will make a simple trajectory evaluator that uses an LLM to determine if any actions were unnecessary."
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 1,
|
"cell_type": "code",
|
||||||
"id": "ca84ab0c-e7e2-4c03-bd74-9cc4e6338eec",
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"id": "ca84ab0c-e7e2-4c03-bd74-9cc4e6338eec",
|
||||||
"outputs": [],
|
"metadata": {},
|
||||||
"source": [
|
"outputs": [],
|
||||||
"from typing import Any, Optional, Sequence, Tuple\n",
|
"source": [
|
||||||
"from langchain.chat_models import ChatOpenAI\n",
|
"from typing import Any, Optional, Sequence, Tuple\n",
|
||||||
"from langchain.chains import LLMChain\n",
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
"from langchain.schema import AgentAction\n",
|
"from langchain.chains import LLMChain\n",
|
||||||
"from langchain.evaluation import AgentTrajectoryEvaluator\n",
|
"from langchain.schema import AgentAction\n",
|
||||||
"\n",
|
"from langchain.evaluation import AgentTrajectoryEvaluator\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class StepNecessityEvaluator(AgentTrajectoryEvaluator):\n",
|
"\n",
|
||||||
" \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
|
"class StepNecessityEvaluator(AgentTrajectoryEvaluator):\n",
|
||||||
"\n",
|
" \"\"\"Evaluate the perplexity of a predicted string.\"\"\"\n",
|
||||||
" def __init__(self) -> None:\n",
|
"\n",
|
||||||
" llm = ChatOpenAI(model=\"gpt-4\", temperature=0.0)\n",
|
" def __init__(self) -> None:\n",
|
||||||
" template = \"\"\"Are any of the following steps unnecessary in answering {input}? Provide the verdict on a new line as a single \"Y\" for yes or \"N\" for no.\n",
|
" llm = ChatOpenAI(model=\"gpt-4\", temperature=0.0)\n",
|
||||||
"\n",
|
" template = \"\"\"Are any of the following steps unnecessary in answering {input}? Provide the verdict on a new line as a single \"Y\" for yes or \"N\" for no.\n",
|
||||||
" DATA\n",
|
"\n",
|
||||||
" ------\n",
|
" DATA\n",
|
||||||
" Steps: {trajectory}\n",
|
" ------\n",
|
||||||
" ------\n",
|
" Steps: {trajectory}\n",
|
||||||
"\n",
|
" ------\n",
|
||||||
" Verdict:\"\"\"\n",
|
"\n",
|
||||||
" self.chain = LLMChain.from_string(llm, template)\n",
|
" Verdict:\"\"\"\n",
|
||||||
"\n",
|
" self.chain = LLMChain.from_string(llm, template)\n",
|
||||||
" def _evaluate_agent_trajectory(\n",
|
"\n",
|
||||||
" self,\n",
|
" def _evaluate_agent_trajectory(\n",
|
||||||
" *,\n",
|
" self,\n",
|
||||||
" prediction: str,\n",
|
" *,\n",
|
||||||
" input: str,\n",
|
" prediction: str,\n",
|
||||||
" agent_trajectory: Sequence[Tuple[AgentAction, str]],\n",
|
" input: str,\n",
|
||||||
" reference: Optional[str] = None,\n",
|
" agent_trajectory: Sequence[Tuple[AgentAction, str]],\n",
|
||||||
" **kwargs: Any,\n",
|
" reference: Optional[str] = None,\n",
|
||||||
" ) -> dict:\n",
|
" **kwargs: Any,\n",
|
||||||
" vals = [\n",
|
" ) -> dict:\n",
|
||||||
" f\"{i}: Action=[{action.tool}] returned observation = [{observation}]\"\n",
|
" vals = [\n",
|
||||||
" for i, (action, observation) in enumerate(agent_trajectory)\n",
|
" f\"{i}: Action=[{action.tool}] returned observation = [{observation}]\"\n",
|
||||||
" ]\n",
|
" for i, (action, observation) in enumerate(agent_trajectory)\n",
|
||||||
" trajectory = \"\\n\".join(vals)\n",
|
" ]\n",
|
||||||
" response = self.chain.run(dict(trajectory=trajectory, input=input), **kwargs)\n",
|
" trajectory = \"\\n\".join(vals)\n",
|
||||||
" decision = response.split(\"\\n\")[-1].strip()\n",
|
" response = self.chain.run(dict(trajectory=trajectory, input=input), **kwargs)\n",
|
||||||
" score = 1 if decision == \"Y\" else 0\n",
|
" decision = response.split(\"\\n\")[-1].strip()\n",
|
||||||
" return {\"score\": score, \"value\": decision, \"reasoning\": response}"
|
" score = 1 if decision == \"Y\" else 0\n",
|
||||||
]
|
" return {\"score\": score, \"value\": decision, \"reasoning\": response}"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "markdown",
|
{
|
||||||
"id": "297dea4b-fb28-4292-b6e0-1c769cfb9cbd",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"id": "297dea4b-fb28-4292-b6e0-1c769cfb9cbd",
|
||||||
"source": [
|
"metadata": {},
|
||||||
"The example above will return a score of 1 if the language model predicts that any of the actions were unnecessary, and it returns a score of 0 if all of them were predicted to be necessary. It returns the string 'decision' as the 'value', and includes the rest of the generated text as 'reasoning' to let you audit the decision.\n",
|
"source": [
|
||||||
"\n",
|
"The example above will return a score of 1 if the language model predicts that any of the actions were unnecessary, and it returns a score of 0 if all of them were predicted to be necessary. It returns the string 'decision' as the 'value', and includes the rest of the generated text as 'reasoning' to let you audit the decision.\n",
|
||||||
"You can call this evaluator to grade the intermediate steps of your agent's trajectory."
|
"\n",
|
||||||
]
|
"You can call this evaluator to grade the intermediate steps of your agent's trajectory."
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "code",
|
{
|
||||||
"execution_count": 3,
|
"cell_type": "code",
|
||||||
"id": "a3fbcc1d-249f-4e00-8841-b6872c73c486",
|
"execution_count": 3,
|
||||||
"metadata": {
|
"id": "a3fbcc1d-249f-4e00-8841-b6872c73c486",
|
||||||
"tags": []
|
"metadata": {
|
||||||
},
|
"tags": []
|
||||||
"outputs": [
|
},
|
||||||
{
|
"outputs": [
|
||||||
"data": {
|
{
|
||||||
"text/plain": [
|
"data": {
|
||||||
"{'score': 1, 'value': 'Y', 'reasoning': 'Y'}"
|
"text/plain": [
|
||||||
]
|
"{'score': 1, 'value': 'Y', 'reasoning': 'Y'}"
|
||||||
},
|
]
|
||||||
"execution_count": 3,
|
},
|
||||||
"metadata": {},
|
"execution_count": 3,
|
||||||
"output_type": "execute_result"
|
"metadata": {},
|
||||||
}
|
"output_type": "execute_result"
|
||||||
],
|
}
|
||||||
"source": [
|
],
|
||||||
"evaluator = StepNecessityEvaluator()\n",
|
"source": [
|
||||||
"\n",
|
"evaluator = StepNecessityEvaluator()\n",
|
||||||
"evaluator.evaluate_agent_trajectory(\n",
|
"\n",
|
||||||
" prediction=\"The answer is pi\",\n",
|
"evaluator.evaluate_agent_trajectory(\n",
|
||||||
" input=\"What is today?\",\n",
|
" prediction=\"The answer is pi\",\n",
|
||||||
" agent_trajectory=[\n",
|
" input=\"What is today?\",\n",
|
||||||
" (\n",
|
" agent_trajectory=[\n",
|
||||||
" AgentAction(tool=\"ask\", tool_input=\"What is today?\", log=\"\"),\n",
|
" (\n",
|
||||||
" \"tomorrow's yesterday\",\n",
|
" AgentAction(tool=\"ask\", tool_input=\"What is today?\", log=\"\"),\n",
|
||||||
" ),\n",
|
" \"tomorrow's yesterday\",\n",
|
||||||
" (\n",
|
" ),\n",
|
||||||
" AgentAction(tool=\"check_tv\", tool_input=\"Watch tv for half hour\", log=\"\"),\n",
|
" (\n",
|
||||||
" \"bzzz\",\n",
|
" AgentAction(tool=\"check_tv\", tool_input=\"Watch tv for half hour\", log=\"\"),\n",
|
||||||
" ),\n",
|
" \"bzzz\",\n",
|
||||||
" ],\n",
|
" ),\n",
|
||||||
")"
|
" ],\n",
|
||||||
]
|
")"
|
||||||
},
|
]
|
||||||
{
|
},
|
||||||
"cell_type": "markdown",
|
{
|
||||||
"id": "77353528-723e-4075-939e-aebdb17c1e4f",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"id": "77353528-723e-4075-939e-aebdb17c1e4f",
|
||||||
"source": []
|
"metadata": {},
|
||||||
}
|
"source": []
|
||||||
],
|
}
|
||||||
"metadata": {
|
],
|
||||||
"kernelspec": {
|
"metadata": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"kernelspec": {
|
||||||
"language": "python",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"name": "python3"
|
"language": "python",
|
||||||
},
|
"name": "python3"
|
||||||
"language_info": {
|
},
|
||||||
"codemirror_mode": {
|
"language_info": {
|
||||||
"name": "ipython",
|
"codemirror_mode": {
|
||||||
"version": 3
|
"name": "ipython",
|
||||||
},
|
"version": 3
|
||||||
"file_extension": ".py",
|
},
|
||||||
"mimetype": "text/x-python",
|
"file_extension": ".py",
|
||||||
"name": "python",
|
"mimetype": "text/x-python",
|
||||||
"nbconvert_exporter": "python",
|
"name": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"nbconvert_exporter": "python",
|
||||||
"version": "3.11.2"
|
"pygments_lexer": "ipython3",
|
||||||
}
|
"version": "3.11.2"
|
||||||
},
|
}
|
||||||
"nbformat": 4,
|
},
|
||||||
"nbformat_minor": 5
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
}
|
}
|
@ -0,0 +1,97 @@
|
|||||||
|
import string
|
||||||
|
from typing import Any, List
|
||||||
|
|
||||||
|
from langchain.evaluation.schema import StringEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
class ExactMatchStringEvaluator(StringEvaluator):
    """Compute an exact match between the prediction and the reference.

    The comparison can optionally normalize case and strip punctuation
    and/or digits from both strings before comparing.

    Examples
    ----------
    >>> evaluator = ExactMatchStringEvaluator()
    >>> evaluator.evaluate_strings(
            prediction="Mindy is the CTO",
            reference="Mindy is the CTO",
        )  # This will return {'score': 1.0}

    >>> evaluator.evaluate_strings(
            prediction="Mindy is the CTO",
            reference="Mindy is the CEO",
        )  # This will return {'score': 0.0}
    """

    def __init__(
        self,
        *,
        ignore_case: bool = False,
        ignore_punctuation: bool = False,
        ignore_numbers: bool = False,
        **kwargs: Any,
    ):
        """Initialize the evaluator.

        Args:
            ignore_case (bool): Compare case-insensitively when True.
            ignore_punctuation (bool): Strip all punctuation characters
                from both strings before comparing when True.
            ignore_numbers (bool): Strip all digit characters from both
                strings before comparing when True.
            **kwargs (Any): Ignored; accepted for signature compatibility
                with the other string evaluators.
        """
        super().__init__()
        self.ignore_case = ignore_case
        self.ignore_punctuation = ignore_punctuation
        self.ignore_numbers = ignore_numbers

    @property
    def requires_input(self) -> bool:
        """
        This evaluator does not require input.
        """
        return False

    @property
    def requires_reference(self) -> bool:
        """
        This evaluator requires a reference.
        """
        return True

    @property
    def input_keys(self) -> List[str]:
        """
        Get the input keys.

        Returns:
            List[str]: The input keys.
        """
        return ["reference", "prediction"]

    @property
    def evaluation_name(self) -> str:
        """
        Get the evaluation name.

        Returns:
            str: The evaluation name.
        """
        return "exact_match"

    def _evaluate_strings(  # type: ignore[arg-type,override]
        self,
        *,
        prediction: str,
        reference: str,
        **kwargs: Any,
    ) -> dict:
        """
        Evaluate the exact match between the prediction and the reference.

        Args:
            prediction (str): The prediction string.
            reference (str): The reference string.

        Returns:
            dict: The evaluation results containing the score
                (1 for an exact match after normalization, 0 otherwise).
        """
        if self.ignore_case:
            prediction = prediction.lower()
            reference = reference.lower()
        if self.ignore_punctuation:
            # str.translate removes every punctuation char in one C-level pass.
            prediction = prediction.translate(str.maketrans("", "", string.punctuation))
            reference = reference.translate(str.maketrans("", "", string.punctuation))
        if self.ignore_numbers:
            prediction = prediction.translate(str.maketrans("", "", string.digits))
            reference = reference.translate(str.maketrans("", "", string.digits))
        return {"score": int(prediction == reference)}
|
@ -0,0 +1,86 @@
|
|||||||
|
import re
|
||||||
|
from typing import Any, List
|
||||||
|
|
||||||
|
from langchain.evaluation.schema import StringEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
class RegexMatchStringEvaluator(StringEvaluator):
    """Score whether a prediction matches a reference regular expression.

    The reference string is treated as a regex pattern and matched against
    the start of the prediction (``re.match`` semantics).

    Examples
    ----------
    >>> evaluator = RegexMatchStringEvaluator(flags=re.IGNORECASE)
    >>> evaluator.evaluate_strings(
            prediction="Mindy is the CTO",
            reference="^mindy.*cto$",
        )  # This will return {'score': 1.0} due to the IGNORECASE flag

    >>> evaluator = RegexMatchStringEvaluator()
    >>> evaluator.evaluate_strings(
            prediction="Mindy is the CTO",
            reference="^Mike.*CEO$",
        )  # This will return {'score': 0.0}

    >>> evaluator.evaluate_strings(
            prediction="Mindy is the CTO",
            reference="^Mike.*CEO$|^Mindy.*CTO$",
        )  # Returns {'score': 1.0}: the second alternative matches
    """

    def __init__(self, *, flags: int = 0, **kwargs: Any):  # Default is no flags
        """Store the ``re`` flags applied during matching (default: none)."""
        super().__init__()
        self.flags = flags

    @property
    def requires_input(self) -> bool:
        """
        No input string is needed by this evaluator.
        """
        return False

    @property
    def requires_reference(self) -> bool:
        """
        A reference pattern is mandatory for this evaluator.
        """
        return True

    @property
    def input_keys(self) -> List[str]:
        """
        Return the keys this evaluator consumes.

        Returns:
            List[str]: The input keys.
        """
        return ["reference", "prediction"]

    @property
    def evaluation_name(self) -> str:
        """
        Return the name under which results are reported.

        Returns:
            str: The evaluation name.
        """
        return "regex_match"

    def _evaluate_strings(  # type: ignore[arg-type,override]
        self,
        *,
        prediction: str,
        reference: str,
        **kwargs: Any,
    ) -> dict:
        """
        Match the reference regex against the prediction string.

        Args:
            prediction (str): The prediction string.
            reference (str): The reference regex pattern.

        Returns:
            dict: The evaluation results containing the score
                (1 when the pattern matches, 0 otherwise).
        """
        matched = re.match(reference, prediction, flags=self.flags)
        score = 1 if matched else 0
        return {"score": score}
|
@ -0,0 +1,49 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain.evaluation import ExactMatchStringEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def exact_match_string_evaluator() -> ExactMatchStringEvaluator:
    """Provide an ExactMatchStringEvaluator left at its default settings."""
    evaluator = ExactMatchStringEvaluator()
    return evaluator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def exact_match_string_evaluator_ignore_case() -> ExactMatchStringEvaluator:
    """Provide an ExactMatchStringEvaluator that compares case-insensitively."""
    evaluator = ExactMatchStringEvaluator(ignore_case=True)
    return evaluator
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_exact_matching(
    exact_match_string_evaluator: ExactMatchStringEvaluator,
) -> None:
    """Identical strings score 1.0; any character difference scores 0.0."""
    prediction = "Mindy is the CTO"
    cases = [
        ("Mindy is the CTO", 1.0),
        ("Mindy is the CEO", 0.0),
    ]
    for reference, expected in cases:
        outcome = exact_match_string_evaluator.evaluate_strings(
            prediction=prediction, reference=reference
        )
        assert outcome["score"] == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_exact_matching_with_ignore_case(
    exact_match_string_evaluator_ignore_case: ExactMatchStringEvaluator,
) -> None:
    """Case differences are ignored, but other differences still score 0.0."""
    prediction = "Mindy is the CTO"
    cases = [
        ("mindy is the cto", 1.0),
        ("mindy is the CEO", 0.0),
    ]
    for reference, expected in cases:
        outcome = exact_match_string_evaluator_ignore_case.evaluate_strings(
            prediction=prediction, reference=reference
        )
        assert outcome["score"] == expected
|
@ -0,0 +1,45 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain.evaluation import RegexMatchStringEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def regex_match_string_evaluator() -> RegexMatchStringEvaluator:
    """Provide a RegexMatchStringEvaluator with no regex flags set."""
    evaluator = RegexMatchStringEvaluator()
    return evaluator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def regex_match_string_evaluator_ignore_case() -> RegexMatchStringEvaluator:
    """Provide a RegexMatchStringEvaluator that matches case-insensitively."""
    evaluator = RegexMatchStringEvaluator(flags=re.IGNORECASE)
    return evaluator
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_regex_matching(
    regex_match_string_evaluator: RegexMatchStringEvaluator,
) -> None:
    """A pattern that matches scores 1.0; a non-matching pattern scores 0.0."""
    prediction = "Mindy is the CTO"
    cases = [
        ("^Mindy.*CTO$", 1.0),
        ("^Mike.*CEO$", 0.0),
    ]
    for reference, expected in cases:
        outcome = regex_match_string_evaluator.evaluate_strings(
            prediction=prediction, reference=reference
        )
        assert outcome["score"] == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_regex_matching_with_ignore_case(
    regex_match_string_evaluator_ignore_case: RegexMatchStringEvaluator,
) -> None:
    """Case differences are ignored when re.IGNORECASE is configured."""
    outcome = regex_match_string_evaluator_ignore_case.evaluate_strings(
        prediction="Mindy is the CTO", reference="^mindy.*cto$"
    )
    assert outcome["score"] == 1.0
|
Loading…
Reference in New Issue