diff --git a/docs/docs/guides/evaluation/string/json.ipynb b/docs/docs/guides/evaluation/string/json.ipynb new file mode 100644 index 0000000000..baaaa0afb3 --- /dev/null +++ b/docs/docs/guides/evaluation/string/json.ipynb @@ -0,0 +1,294 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "465cfbef-5bba-4b3b-b02d-fe2eba39db17", + "metadata": {}, + "source": [ + "# Evaluating Structured Output: JSON Evaluators\n", + "\n", + "Evaluating [extraction](https://python.langchain.com/docs/use_cases/extraction) and function calling applications often comes down to validating that the LLM's string output can be parsed correctly and comparing it to a reference object. The following JSON evaluators provide functionality to check your model's output in a consistent way.\n", + "\n", + "## JsonValidityEvaluator\n", + "\n", + "The `JsonValidityEvaluator` is designed to check the validity of a JSON string prediction.\n", + "\n", + "### Overview:\n", + "- **Requires Input?**: No\n", + "- **Requires Reference?**: No" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "02e5f7dd-82fe-48f9-a251-b2052e17e61c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'score': 1}\n" + ] + } + ], + "source": [ + "from langchain.evaluation import JsonValidityEvaluator, load_evaluator\n", + "\n", + "evaluator = JsonValidityEvaluator()\n", + "# Equivalently\n", + "# evaluator = load_evaluator(\"json_validity\")\n", + "prediction = '{\"name\": \"John\", \"age\": 30, \"city\": \"New York\"}'\n", + "\n", + "result = evaluator.evaluate_strings(prediction=prediction)\n", + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9a9607c6-edab-4c26-86c4-22b226e18aa9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'score': 0, 'reasoning': 'Expecting property name enclosed in double quotes: line 1 column 48 (char 47)'}\n" + ] + } + ], + "source": [ + "prediction = '{\"name\": \"John\", \"age\": 30, \"city\": \"New York\",}'\n", + "result = evaluator.evaluate_strings(prediction=prediction)\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "8ac18a83-30d8-4c11-abf2-7a36e4cb829f", + "metadata": {}, + "source": [ + "## JsonEqualityEvaluator\n", + "\n", + "The `JsonEqualityEvaluator` assesses whether a JSON prediction matches a given reference after both are parsed.\n", + "\n", + "### Overview:\n", + "- **Requires Input?**: No\n", + "- **Requires Reference?**: Yes\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ab97111e-cba9-4273-825f-d5d4278a953c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'score': True}\n" + ] + } + ], + "source": [ + "from langchain.evaluation import JsonEqualityEvaluator\n", + "\n", + "evaluator = JsonEqualityEvaluator()\n", + "# Equivalently\n", + "# evaluator = load_evaluator(\"json_equality\")\n", + "result = evaluator.evaluate_strings(prediction='{\"a\": 1}', reference='{\"a\": 1}')\n", + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "655ba486-09b6-47ce-947d-b2bd8b6f6364", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'score': False}\n" + ] + } + ], + "source": [ + "result = evaluator.evaluate_strings(prediction='{\"a\": 1}', reference='{\"a\": 2}')\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "1ac7e541-b7fe-46b6-bc3a-e94fe316227e", +
"metadata": {}, + "source": [ + "The evaluator also by default lets you provide a dictionary directly" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "36e70ba3-4e62-483c-893a-5f328b7f303d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'score': False}\n" + ] + } + ], + "source": [ + "result = evaluator.evaluate_strings(prediction={\"a\": 1}, reference={\"a\": 2})\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "921d33f0-b3c2-4e9e-820c-9ec30bc5bb20", + "metadata": {}, + "source": [ + "## JsonEditDistanceEvaluator\n", + "\n", + "The `JsonEditDistanceEvaluator` computes a normalized Damerau-Levenshtein distance between two \"canonicalized\" JSON strings.\n", + "\n", + "### Overview:\n", + "- **Requires Input?**: No\n", + "- **Requires Reference?**: Yes\n", + "- **Distance Function**: Damerau-Levenshtein (by default)\n", + "\n", + "_Note: Ensure that `rapidfuzz` is installed or provide an alternative `string_distance` function to avoid an ImportError._" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "da9ec3a3-675f-4420-8ec7-cde48d8c2918", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'score': 0.07692307692307693}\n" + ] + } + ], + "source": [ + "from langchain.evaluation import JsonEditDistanceEvaluator\n", + "\n", + "evaluator = JsonEditDistanceEvaluator()\n", + "# Equivalently\n", + "# evaluator = load_evaluator(\"json_edit_distance\")\n", + "\n", + "result = evaluator.evaluate_strings(\n", + " prediction='{\"a\": 1, \"b\": 2}', reference='{\"a\": 1, \"b\": 3}'\n", + ")\n", + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "537ed58c-6a9c-402f-8f7f-07b1119a9ae0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'score': 0.0}\n" + ] + } + ], + "source": [ + "# The values are canonicalized prior to comparison\n", + "result = evaluator.evaluate_strings(\n", + " prediction=\"\"\"\n", + " {\n", + " \"b\": 3,\n", + " \"a\": 1\n", + " }\"\"\",\n", + " reference='{\"a\": 1, \"b\": 3}',\n", + ")\n", + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7a8f3ec5-1cde-4b0e-80cd-ac0ac290d375", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'score': 0.18181818181818182}\n" + ] + } + ], + "source": [ + "# Lists maintain their order, however\n", + "result = evaluator.evaluate_strings(\n", + " prediction='{\"a\": [1, 2]}', reference='{\"a\": [2, 1]}'\n", + ")\n", + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "52abec79-58ed-4ab6-9fb1-7deb1f5146cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'score': 0.14285714285714285}\n" + ] + } + ], + "source": [ + "# You can also pass in objects directly\n", + "result = evaluator.evaluate_strings(prediction={\"a\": 1}, reference={\"a\": 2})\n", + "print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85afcf33-d2f4-406e-9d8f-15dc0a4772f2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
"python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/evaluation/__init__.py b/libs/langchain/langchain/evaluation/__init__.py index a2143cada3..dd87630b56 100644 --- a/libs/langchain/langchain/evaluation/__init__.py +++ b/libs/langchain/langchain/evaluation/__init__.py @@ -69,6 +69,11 @@ from langchain.evaluation.embedding_distance import ( ) from langchain.evaluation.exact_match.base import ExactMatchStringEvaluator from langchain.evaluation.loading import load_dataset, load_evaluator, load_evaluators +from langchain.evaluation.parsing.base import ( + JsonEqualityEvaluator, + JsonValidityEvaluator, +) +from langchain.evaluation.parsing.json_distance import JsonEditDistanceEvaluator from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain from langchain.evaluation.regex_match.base import RegexMatchStringEvaluator from langchain.evaluation.schema import ( @@ -114,4 +119,7 @@ __all__ = [ "AgentTrajectoryEvaluator", "ScoreStringEvalChain", "LabeledScoreStringEvalChain", + "JsonValidityEvaluator", + "JsonEqualityEvaluator", + "JsonEditDistanceEvaluator", ] diff --git a/libs/langchain/langchain/evaluation/loading.py b/libs/langchain/langchain/evaluation/loading.py index 21aacaf614..ac218f0ca5 100644 --- a/libs/langchain/langchain/evaluation/loading.py +++ b/libs/langchain/langchain/evaluation/loading.py @@ -19,6 +19,7 @@ from langchain.evaluation.parsing.base import ( JsonEqualityEvaluator, JsonValidityEvaluator, ) +from langchain.evaluation.parsing.json_distance import JsonEditDistanceEvaluator from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain from langchain.evaluation.regex_match.base import RegexMatchStringEvaluator from langchain.evaluation.schema import EvaluatorType, LLMEvalChain, StringEvaluator @@ -86,6 +87,7 @@ _EVALUATOR_MAP: Dict[ EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: PairwiseEmbeddingDistanceEvalChain, EvaluatorType.JSON_VALIDITY: JsonValidityEvaluator, EvaluatorType.JSON_EQUALITY: JsonEqualityEvaluator, + EvaluatorType.JSON_EDIT_DISTANCE: JsonEditDistanceEvaluator, EvaluatorType.REGEX_MATCH: RegexMatchStringEvaluator, EvaluatorType.EXACT_MATCH: ExactMatchStringEvaluator, } diff --git a/libs/langchain/langchain/evaluation/parsing/base.py b/libs/langchain/langchain/evaluation/parsing/base.py index d2233b5148..1504297419 100644 --- a/libs/langchain/langchain/evaluation/parsing/base.py +++ b/libs/langchain/langchain/evaluation/parsing/base.py @@ -122,9 +122,12 @@ class JsonEqualityEvaluator(StringEvaluator): return "json_equality" def _parse_json( - self, string: str + self, + string: Any, ) -> Union[dict, list, None, float, bool, int, str]: - return parse_json_markdown(string) + if isinstance(string, str): + return parse_json_markdown(string) + return string def _evaluate_strings( self, diff --git a/libs/langchain/langchain/evaluation/parsing/json_distance.py b/libs/langchain/langchain/evaluation/parsing/json_distance.py new file mode 100644 index 0000000000..93287136d3 --- /dev/null +++ b/libs/langchain/langchain/evaluation/parsing/json_distance.py @@ -0,0 +1,91 @@ +import json +from typing import Any, Callable, Optional, Union + +from langchain.evaluation.schema import StringEvaluator +from langchain.output_parsers.json import parse_json_markdown + + +class JsonEditDistanceEvaluator(StringEvaluator): + """ + An evaluator that calculates the edit distance between JSON strings. 
+ + This evaluator computes a normalized Damerau-Levenshtein distance between two JSON strings + after parsing them and converting them to a canonical format (i.e., whitespace and key order are normalized). + It can be customized with alternative distance and canonicalization functions. + + Args: + string_distance (Optional[Callable[[str, str], float]]): A callable that computes the distance between two strings. + If not provided, a Damerau-Levenshtein distance from the `rapidfuzz` package will be used. + canonicalize (Optional[Callable[[Any], Any]]): A callable that converts a parsed JSON object into its canonical string form. + If not provided, the default behavior is to serialize the JSON with sorted keys and no extra whitespace. + **kwargs (Any): Additional keyword arguments. + + Attributes: + _string_distance (Callable[[str, str], float]): The internal distance computation function. + _canonicalize (Callable[[Any], Any]): The internal canonicalization function. + + Examples: + >>> evaluator = JsonEditDistanceEvaluator() + >>> result = evaluator.evaluate_strings(prediction='{"a": 1, "b": 2}', reference='{"a": 1, "b": 3}') + >>> assert result["score"] is not None + + Raises: + ImportError: If `rapidfuzz` is not installed and no alternative `string_distance` function is provided. + + """ # noqa: E501 + + def __init__( + self, + string_distance: Optional[Callable[[str, str], float]] = None, + canonicalize: Optional[Callable[[Any], Any]] = None, + **kwargs: Any + ) -> None: + super().__init__() + if string_distance is not None: + self._string_distance = string_distance + else: + try: + from rapidfuzz import distance as rfd # noqa: F401 + except ImportError: + raise ImportError( + "The default string_distance operator for the " + "JsonEditDistanceEvaluator requires installation of " + "the rapidfuzz package. " + "Please install it with `pip install rapidfuzz`."
+ ) + self._string_distance = rfd.DamerauLevenshtein.normalized_distance + if canonicalize is not None: + self._canonicalize = canonicalize + else: + self._canonicalize = lambda x: json.dumps( + x, separators=(",", ":"), sort_keys=True # eliminate whitespace + ) + + @property + def requires_input(self) -> bool: + return False + + @property + def requires_reference(self) -> bool: + return True + + @property + def evaluation_name(self) -> str: + return "json_edit_distance" + + def _parse_json(self, node: Any) -> Union[dict, list, None, float, bool, int, str]: + if isinstance(node, str): + return parse_json_markdown(node) + return node + + def _evaluate_strings( + self, + prediction: str, + input: Optional[str] = None, + reference: Optional[str] = None, + **kwargs: Any + ) -> dict: + parsed = self._canonicalize(self._parse_json(prediction)) + label = self._canonicalize(self._parse_json(reference)) + distance = self._string_distance(parsed, label) + return {"score": distance} diff --git a/libs/langchain/langchain/evaluation/schema.py b/libs/langchain/langchain/evaluation/schema.py index 50b541f462..43e8ed92ed 100644 --- a/libs/langchain/langchain/evaluation/schema.py +++ b/libs/langchain/langchain/evaluation/schema.py @@ -64,6 +64,8 @@ class EvaluatorType(str, Enum): """Check if a prediction is valid JSON.""" JSON_EQUALITY = "json_equality" """Check if a prediction is equal to a reference JSON.""" + JSON_EDIT_DISTANCE = "json_edit_distance" + """Compute the edit distance between two JSON strings after canonicalization.""" class LLMEvalChain(Chain): diff --git a/libs/langchain/tests/unit_tests/evaluation/parsing/test_json_distance.py b/libs/langchain/tests/unit_tests/evaluation/parsing/test_json_distance.py new file mode 100644 index 0000000000..e0bee70ddc --- /dev/null +++ b/libs/langchain/tests/unit_tests/evaluation/parsing/test_json_distance.py @@ -0,0 +1,117 @@ +import pytest + +from langchain.evaluation.parsing.json_distance import JsonEditDistanceEvaluator + + +@pytest.fixture +def json_distance_evaluator() -> JsonEditDistanceEvaluator: + return JsonEditDistanceEvaluator() + + +@pytest.mark.requires("rapidfuzz") +def test_json_distance_evaluator_requires_input( + json_distance_evaluator: JsonEditDistanceEvaluator, +) -> None: + assert json_distance_evaluator.requires_input is False + + +@pytest.mark.requires("rapidfuzz") +def test_json_distance_evaluator_requires_reference( + json_distance_evaluator: JsonEditDistanceEvaluator, +) -> None: + assert json_distance_evaluator.requires_reference is True + + +@pytest.mark.requires("rapidfuzz") +def test_json_distance_evaluator_evaluation_name( + json_distance_evaluator: JsonEditDistanceEvaluator, +) -> None: + assert json_distance_evaluator.evaluation_name == "json_edit_distance" + + +@pytest.mark.requires("rapidfuzz") +def test_json_distance_evaluator_parse_json( + json_distance_evaluator: JsonEditDistanceEvaluator, +) -> None: + string = '{"a": 1}' + result = json_distance_evaluator._parse_json(string) + assert result == {"a": 1} + + +@pytest.mark.requires("rapidfuzz") +def test_json_distance_evaluator_evaluate_strings_simple_diff( + json_distance_evaluator: JsonEditDistanceEvaluator, +) -> None: + prediction = '{"a": 1}' + reference = '{"a": 2}' + result = json_distance_evaluator._evaluate_strings( + prediction=prediction, reference=reference + ) + # Only 1 character flipped + assert result["score"] == pytest.approx(1 / 7) + + +@pytest.mark.requires("rapidfuzz") +def test_json_distance_evaluator_evaluate_strings_complex_diff( +
json_distance_evaluator: JsonEditDistanceEvaluator, +) -> None: + prediction = '{"a":1, "b": {"c": 2, "d": 3}}' + reference = '{"a": 1, "b": {"c": 2, "d": 4}}' + result = json_distance_evaluator._evaluate_strings( + prediction=prediction, reference=reference + ) + # Only 1 character flipped + assert result["score"] == pytest.approx(1 / len(reference.replace(" ", ""))) + + +@pytest.mark.requires("rapidfuzz") +def test_json_distance_evaluator_evaluate_strings_list_diff( + json_distance_evaluator: JsonEditDistanceEvaluator, +) -> None: + prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]' + reference = '[{"a": 1, "b": 2}, {"a": 2, "b": 4}]' + result = json_distance_evaluator._evaluate_strings( + prediction=prediction, reference=reference + ) + # Again only 1 character flipped + assert result["score"] == pytest.approx(1 / len(reference.replace(" ", ""))) + + +@pytest.mark.requires("rapidfuzz") +def test_json_distance_evaluator_evaluate_strings_list_same( + json_distance_evaluator: JsonEditDistanceEvaluator, +) -> None: + prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]' + reference = '[{"b": 2, "a": 1}, {"b": 3, "a": 2}]' + result = json_distance_evaluator._evaluate_strings( + prediction=prediction, reference=reference + ) + assert result["score"] == 0 + + +@pytest.mark.requires("rapidfuzz") +def test_json_distance_evaluator_evaluate_strings_list_diff_length( + json_distance_evaluator: JsonEditDistanceEvaluator, +) -> None: + prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]' + reference = '[{"a": 1, "b": 2}]' + result = json_distance_evaluator._evaluate_strings( + prediction=prediction, reference=reference + ) + # The second object and its separating comma are deleted; the distance is + # normalized by the length of the longer canonical string + assert result["score"] == pytest.approx( + len(',{"a":2,"b":3}') / len(prediction.replace(" ", "")) + ) + + +@pytest.mark.requires("rapidfuzz") +def test_json_distance_evaluator_evaluate_strings_custom_operator_equal() -> None: + """Custom operator that returns 0.5 if strings are different.""" + + def custom_distance(a: str, b: str) -> float: + return 0.5 if a != b else 0.0 + + evaluator = JsonEditDistanceEvaluator(string_distance=custom_distance) + prediction = '{"a": "apple", "b": "banana"}' + reference = '{"a": "apple", "b": "berries"}' + result = evaluator._evaluate_strings(prediction=prediction, reference=reference) + assert result["score"] == 0.5
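The notebook and tests above only exercise `JsonEditDistanceEvaluator` with its defaults, but the constructor added in this diff also accepts custom `string_distance` and `canonicalize` callables. Below is a minimal usage sketch based solely on that constructor signature; the float-rounding canonicalizer is a hypothetical illustration, not part of this change, and the default distance function still requires `rapidfuzz` to be installed.

```python
import json
from typing import Any

from langchain.evaluation import JsonEditDistanceEvaluator


def round_floats(node: Any) -> str:
    """Hypothetical canonicalizer: round floats to 3 places before serializing,
    so near-equal numbers do not inflate the edit distance."""

    def _round(value: Any) -> Any:
        if isinstance(value, float):
            return round(value, 3)
        if isinstance(value, dict):
            return {key: _round(val) for key, val in value.items()}
        if isinstance(value, list):
            return [_round(val) for val in value]
        return value

    # Same serialization the evaluator uses by default: sorted keys, no whitespace.
    return json.dumps(_round(node), separators=(",", ":"), sort_keys=True)


# Uses the default rapidfuzz Damerau-Levenshtein distance; pass string_distance=...
# to swap in another metric if rapidfuzz is not available.
evaluator = JsonEditDistanceEvaluator(canonicalize=round_floats)
result = evaluator.evaluate_strings(
    prediction='{"pi": 3.14159}', reference='{"pi": 3.1416}'
)
print(result)  # {'score': 0.0} once both values round to 3.142
```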