mirror of https://github.com/hwchase17/langchain
Add Schema Evals (#9228)
Simple eval checks for whether a generation is valid JSON and whether it matches an expected dict.
pull/9287/head
parent
74a64cfbab
commit
2519580994
@ -0,0 +1,153 @@
|
||||
"""Evaluators for parsing strings."""
|
||||
from operator import eq
|
||||
from typing import Any, Callable, Optional, Union, cast
|
||||
|
||||
from langchain.evaluation.schema import StringEvaluator
|
||||
from langchain.output_parsers.json import parse_json_markdown
|
||||
|
||||
|
||||
class JsonValidityEvaluator(StringEvaluator):
    """Score whether a prediction string can be parsed as JSON.

    The prediction is passed to ``parse_json_markdown``. If parsing succeeds
    the score is 1; if it raises, the score is 0 and the exception text is
    returned as the reasoning. Neither an input nor a reference is needed.

    Attributes:
        requires_input (bool): Whether this evaluator requires an input
            string. Always False.
        requires_reference (bool): Whether this evaluator requires a
            reference string. Always False.
        evaluation_name (str): The name of the evaluation metric.
            Always "json_validity".

    Examples:
        >>> evaluator = JsonValidityEvaluator()
        >>> prediction = '{"name": "John", "age": 30, "city": "New York"}'
        >>> evaluator.evaluate_strings(prediction=prediction)
        {'score': 1}

        >>> prediction = '{"name": "John", "age": 30, "city": "New York",}'
        >>> evaluator.evaluate_strings(prediction=prediction)
        {'score': 0, 'reasoning': 'Expecting property name enclosed in double quotes...'}
    """

    def __init__(self, **kwargs: Any) -> None:
        # Extra keyword arguments are accepted but not forwarded to the base class.
        super().__init__()

    @property
    def requires_input(self) -> bool:
        """This evaluator never needs an input string."""
        return False

    @property
    def requires_reference(self) -> bool:
        """This evaluator never needs a reference string."""
        return False

    @property
    def evaluation_name(self) -> str:
        """Name under which this metric is reported."""
        return "json_validity"

    def _evaluate_strings(
        self,
        prediction: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        **kwargs: Any
    ) -> dict:
        """Check that ``prediction`` parses as JSON.

        Args:
            prediction (str): The string to validate.
            input (str, optional): Ignored. Defaults to None.
            reference (str, optional): Ignored. Defaults to None.

        Returns:
            dict: ``{"score": 1}`` when parsing succeeds. When parsing raises,
                ``{"score": 0, "reasoning": <message>}`` where the message is
                the string form of the exception.
        """
        try:
            parse_json_markdown(prediction)
        except Exception as exc:
            # Any parsing failure counts as "invalid"; report why.
            verdict = {"score": 0, "reasoning": str(exc)}
        else:
            verdict = {"score": 1}
        return verdict
|
||||
|
||||
|
||||
class JsonEqualityEvaluator(StringEvaluator):
    """Evaluates whether the prediction is equal to the reference after
    parsing both as JSON.

    Both strings are parsed with ``parse_json_markdown`` and the parsed values
    are handed to ``operator`` (``operator.eq`` unless one is supplied).
    When the reference parses to a list the comparison is order-insensitive:
    a non-list prediction scores 0, otherwise both lists are sorted by a
    canonical JSON rendering of each element before being compared.
    It does not require an input string.

    Attributes:
        requires_input (bool): Whether this evaluator requires an
            input string. Always False.
        requires_reference (bool): Whether this evaluator requires
            a reference string. Always True.
        evaluation_name (str): The name of the evaluation metric.
            Always "json_equality".

    Examples:
        >>> evaluator = JsonEqualityEvaluator()
        >>> evaluator.evaluate_strings('{"a": 1}', reference='{"a": 1}')
        {'score': True}
        >>> evaluator.evaluate_strings('{"a": 1}', reference='{"a": 2}')
        {'score': False}
        >>> evaluator.evaluate_strings('[1, 2]', reference='[2, 1]')
        {'score': True}

        >>> evaluator = JsonEqualityEvaluator(operator=lambda x, y: x['a'] == y['a'])
        >>> evaluator.evaluate_strings('{"a": 1}', reference='{"a": 1}')
        {'score': True}
        >>> evaluator.evaluate_strings('{"a": 1}', reference='{"a": 2}')
        {'score': False}

    """

    def __init__(self, operator: Optional[Callable] = None, **kwargs: Any) -> None:
        # Extra keyword arguments are accepted but not forwarded to the base class.
        super().__init__()
        # Two-argument callable applied to (parsed prediction, parsed reference).
        self.operator = operator or eq

    @property
    def requires_input(self) -> bool:
        """This evaluator never needs an input string."""
        return False

    @property
    def requires_reference(self) -> bool:
        """A reference string is mandatory for this evaluator."""
        return True

    @property
    def evaluation_name(self) -> str:
        """Name under which this metric is reported."""
        return "json_equality"

    def _parse_json(
        self, string: str
    ) -> Union[dict, list, None, float, bool, int, str]:
        """Parse ``string`` with ``parse_json_markdown`` and return the result."""
        return parse_json_markdown(string)

    @staticmethod
    def _sorted_canonically(items: list) -> list:
        """Return ``items`` sorted by a dict-key-order-independent rendering.

        Sorting by ``str(item)`` is sensitive to the key order of dict
        elements, so two lists holding equal dicts written with different key
        orders could end up in different positions and compare unequal.
        ``json.dumps(..., sort_keys=True)`` yields the same key for equal
        dicts regardless of how their keys were ordered.
        """
        import json  # local import: the module's import block does not provide it

        def canonical(item: Any) -> str:
            try:
                return json.dumps(item, sort_keys=True)
            except (TypeError, ValueError):
                # Not JSON-serializable (e.g. from an overridden parser):
                # fall back to the previous ordering key.
                return str(item)

        return sorted(items, key=canonical)

    def _evaluate_strings(
        self,
        prediction: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        **kwargs: Any
    ) -> dict:
        """Evaluate the prediction string.

        Args:
            prediction (str): The prediction string to evaluate.
            input (str, optional): Not used in this evaluator.
            reference (str): The reference string to compare against.

        Returns:
            dict: A dictionary with a single "score" entry. It is 0 when the
                reference is a list and the prediction is not; otherwise it is
                whatever ``self.operator`` returns for the two parsed values.
        """
        parsed = self._parse_json(prediction)
        label = self._parse_json(cast(str, reference))
        if isinstance(label, list):
            if not isinstance(parsed, list):
                return {"score": 0}
            # Bring both lists into the same canonical order so that the
            # comparison is invariant to element permutation.
            parsed = self._sorted_canonically(parsed)
            label = self._sorted_canonically(label)
        return {"score": self.operator(parsed, label)}
|
@ -0,0 +1,177 @@
|
||||
import random
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.evaluation.parsing.base import (
|
||||
JsonEqualityEvaluator,
|
||||
JsonValidityEvaluator,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def json_validity_evaluator() -> JsonValidityEvaluator:
    """Provide a freshly constructed validity evaluator to each test."""
    evaluator = JsonValidityEvaluator()
    return evaluator
|
||||
|
||||
|
||||
def test_json_validity_evaluator_requires_input(
    json_validity_evaluator: JsonValidityEvaluator,
) -> None:
    """The validity evaluator reports that it needs no input string."""
    needs_input = json_validity_evaluator.requires_input
    assert needs_input is False
|
||||
|
||||
|
||||
def test_json_validity_evaluator_requires_reference(
    json_validity_evaluator: JsonValidityEvaluator,
) -> None:
    """The validity evaluator reports that it needs no reference string."""
    needs_reference = json_validity_evaluator.requires_reference
    assert needs_reference is False
|
||||
|
||||
|
||||
def test_json_validity_evaluator_evaluation_name(
    json_validity_evaluator: JsonValidityEvaluator,
) -> None:
    """The validity evaluator is reported under the name "json_validity"."""
    metric_name = json_validity_evaluator.evaluation_name
    assert metric_name == "json_validity"
|
||||
|
||||
|
||||
def test_json_validity_evaluator_evaluate_valid_json(
    json_validity_evaluator: JsonValidityEvaluator,
) -> None:
    """A well-formed JSON object scores 1 with no reasoning attached."""
    outcome = json_validity_evaluator.evaluate_strings(
        prediction='{"name": "John", "age": 30, "city": "New York"}'
    )
    assert outcome == {"score": 1}
|
||||
|
||||
|
||||
def test_json_validity_evaluator_evaluate_invalid_json(
    json_validity_evaluator: JsonValidityEvaluator,
) -> None:
    """An object with a trailing comma scores 0 and reports the parser error."""
    malformed = '{"name": "John", "age": 30, "city": "New York",}'
    expected_prefix = "Expecting property name enclosed in double quotes"
    outcome = json_validity_evaluator.evaluate_strings(prediction=malformed)
    assert outcome["score"] == 0
    # Only the prefix is checked; the parser's message continues after it.
    assert outcome["reasoning"].startswith(expected_prefix)
|
||||
|
||||
|
||||
@pytest.fixture
def json_equality_evaluator() -> JsonEqualityEvaluator:
    """Provide a default-configured equality evaluator to each test."""
    evaluator = JsonEqualityEvaluator()
    return evaluator
|
||||
|
||||
|
||||
def test_json_equality_evaluator_requires_input(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    """The equality evaluator reports that it needs no input string."""
    needs_input = json_equality_evaluator.requires_input
    assert needs_input is False
|
||||
|
||||
|
||||
def test_json_equality_evaluator_requires_reference(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    """The equality evaluator reports that a reference string is required."""
    needs_reference = json_equality_evaluator.requires_reference
    assert needs_reference is True
|
||||
|
||||
|
||||
def test_json_equality_evaluator_evaluation_name(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    """The equality evaluator is reported under the name "json_equality"."""
    metric_name = json_equality_evaluator.evaluation_name
    assert metric_name == "json_equality"
|
||||
|
||||
|
||||
def test_json_equality_evaluator_parse_json(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    """``_parse_json`` turns a JSON object string into the matching dict."""
    decoded = json_equality_evaluator._parse_json('{"a": 1}')
    assert decoded == {"a": 1}
|
||||
|
||||
|
||||
def test_json_equality_evaluator_evaluate_strings_equal(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    """Identical JSON objects compare equal under the default operator."""
    outcome = json_equality_evaluator.evaluate_strings(
        prediction='{"a": 1}',
        reference='{"a": 1}',
    )
    assert outcome == {"score": True}
|
||||
|
||||
|
||||
def test_json_equality_evaluator_evaluate_strings_not_equal(
    json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
    """Objects that differ in a value compare unequal under the default operator."""
    outcome = json_equality_evaluator.evaluate_strings(
        prediction='{"a": 1}',
        reference='{"a": 2}',
    )
    assert outcome == {"score": False}
|
||||
|
||||
|
||||
def test_json_equality_evaluator_evaluate_strings_custom_operator_equal() -> None:
    """A custom operator that only inspects key "a" ignores other differing keys."""

    def same_a(left: dict, right: dict) -> bool:
        return left["a"] == right["a"]

    evaluator = JsonEqualityEvaluator(operator=same_a)
    outcome = evaluator.evaluate_strings(
        prediction='{"a": 1, "b": 2}',
        reference='{"a": 1, "c": 3}',
    )
    assert outcome == {"score": True}
|
||||
|
||||
|
||||
def test_json_equality_evaluator_evaluate_strings_custom_operator_not_equal() -> None:
    """A custom operator comparing key "a" reports a mismatch when "a" differs."""

    def same_a(left: dict, right: dict) -> bool:
        return left["a"] == right["a"]

    evaluator = JsonEqualityEvaluator(operator=same_a)
    outcome = evaluator.evaluate_strings(
        prediction='{"a": 1}',
        reference='{"a": 2}',
    )
    assert outcome == {"score": False}
|
||||
|
||||
|
||||
def test_json_equality_evaluator_evaluate_lists_permutation_invariant() -> None:
    """List comparison ignores element order but not element content or count."""
    evaluator = JsonEqualityEvaluator()
    pair = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'

    # Each entry is (prediction, reference, expected score).
    small_cases = [
        # Same elements, reversed order.
        (pair, '[{"a": 2, "b": 3}, {"a": 1, "b": 2}]', True),
        # Same length, one element differs.
        (pair, '[{"a": 2, "b": 3}, {"a": 1, "b": 4}]', False),
        # Reference is shorter.
        (pair, '[{"a": 2, "b": 3}]', False),
        # Reference is longer.
        (pair, '[{"a": 2, "b": 3}, {"a": 1, "b": 2}, {"a": 3, "b": 4}]', False),
        # Roles swapped: the longer list is the prediction.
        ('[{"a": 2, "b": 3}, {"b": 2,"a": 1}, {"a": 3, "b": 4}]', pair, False),
    ]
    for prediction, reference, expected in small_cases:
        result = evaluator.evaluate_strings(
            prediction=prediction, reference=reference
        )
        assert result == {"score": expected}

    # Limit tests
    items = [f'{{"a": {i}, "b": {i+1}}}' for i in range(1000)]
    prediction = "[" + ",".join(items) + "]"
    shuffled = list(items)
    random.shuffle(shuffled)
    reference = "[" + ",".join(shuffled) + "]"
    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
    assert result == {"score": True}

    # Large lists whose contents differ (every reference "a" is shifted by one).
    swapped_keys = [f'{{"b": {i+1}, "a": {i}}}' for i in range(1000)]
    prediction = "[" + ",".join(swapped_keys) + "]"
    shifted = [f'{{"a": {i+1}, "b": {i+2}}}' for i in range(999)]
    shifted.append('{"a": 1000, "b": 1001}')
    reference = "[" + ",".join(shifted) + "]"
    result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
    assert result == {"score": False}
|
Loading…
Reference in New Issue