Add Schema Evals (#9228)

Adds simple eval checks for whether a generation is valid JSON and whether it matches an expected dict.
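A minimal usage sketch of the two new evaluators (import paths as they appear in the diff below; the error reason is abbreviated):

```python
from langchain.evaluation.loading import load_evaluator
from langchain.evaluation.schema import EvaluatorType

# Validity check: score 1 if the prediction parses as JSON, otherwise 0 plus the parse error.
validity = load_evaluator(EvaluatorType.JSON_VALIDITY)
print(validity.evaluate_strings(prediction='{"name": "John", "age": 30}'))
# -> {'score': 1}
print(validity.evaluate_strings(prediction='{"name": "John", "age": 30,}'))
# -> {'score': 0, 'reasoning': 'Expecting property name enclosed in double quotes ...'}

# Equality check: prediction and reference are both parsed as JSON and compared.
equality = load_evaluator(EvaluatorType.JSON_EQUALITY)
print(equality.evaluate_strings(prediction='{"a": 1}', reference='{"a": 1}'))
# -> {'score': True}
```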
William FH 1 year ago committed by GitHub
parent 74a64cfbab
commit 2519580994

@ -14,8 +14,12 @@ from langchain.evaluation.embedding_distance.base import (
EmbeddingDistanceEvalChain,
PairwiseEmbeddingDistanceEvalChain,
)
from langchain.evaluation.parsing.base import (
JsonEqualityEvaluator,
JsonValidityEvaluator,
)
from langchain.evaluation.qa import ContextQAEvalChain, CotQAEvalChain, QAEvalChain
from langchain.evaluation.schema import EvaluatorType, LLMEvalChain
from langchain.evaluation.schema import EvaluatorType, LLMEvalChain, StringEvaluator
from langchain.evaluation.string_distance.base import (
PairwiseStringDistanceEvalChain,
StringDistanceEvalChain,
@ -57,7 +61,9 @@ def load_dataset(uri: str) -> List[Dict]:
return [d for d in dataset["train"]]
_EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
_EVALUATOR_MAP: Dict[
EvaluatorType, Union[Type[LLMEvalChain], Type[Chain], Type[StringEvaluator]]
] = {
EvaluatorType.QA: QAEvalChain,
EvaluatorType.COT_QA: CotQAEvalChain,
EvaluatorType.CONTEXT_QA: ContextQAEvalChain,
@ -70,6 +76,8 @@ _EVALUATOR_MAP: Dict[EvaluatorType, Union[Type[LLMEvalChain], Type[Chain]]] = {
EvaluatorType.PAIRWISE_STRING_DISTANCE: PairwiseStringDistanceEvalChain,
EvaluatorType.EMBEDDING_DISTANCE: EmbeddingDistanceEvalChain,
EvaluatorType.PAIRWISE_EMBEDDING_DISTANCE: PairwiseEmbeddingDistanceEvalChain,
EvaluatorType.JSON_VALIDITY: JsonValidityEvaluator,
EvaluatorType.JSON_EQUALITY: JsonEqualityEvaluator,
}
@ -78,7 +86,7 @@ def load_evaluator(
*,
llm: Optional[BaseLanguageModel] = None,
**kwargs: Any,
) -> Chain:
) -> Union[Chain, StringEvaluator]:
"""Load the requested evaluation chain specified by a string.
Parameters
@ -119,7 +127,7 @@ def load_evaluators(
llm: Optional[BaseLanguageModel] = None,
config: Optional[dict] = None,
**kwargs: Any,
) -> List[Chain]:
) -> List[Union[Chain, StringEvaluator]]:
"""Load evaluators specified by a list of evaluator types.
Parameters

@ -0,0 +1,153 @@
"""Evaluators for parsing strings."""
from operator import eq
from typing import Any, Callable, Optional, Union, cast
from langchain.evaluation.schema import StringEvaluator
from langchain.output_parsers.json import parse_json_markdown
class JsonValidityEvaluator(StringEvaluator):
"""Evaluates whether the prediction is valid JSON.
This evaluator checks if the prediction is a valid JSON string. It does not
require any input or reference.
Attributes:
requires_input (bool): Whether this evaluator requires an input
string. Always False.
requires_reference (bool): Whether this evaluator requires a
reference string. Always False.
evaluation_name (str): The name of the evaluation metric.
Always "json".
Examples:
>>> evaluator = JsonValidityEvaluator()
>>> prediction = '{"name": "John", "age": 30, "city": "New York"}'
>>> evaluator.evaluate_strings(prediction=prediction)
{'score': 1}
>>> prediction = '{"name": "John", "age": 30, "city": "New York",}'
>>> evaluator.evaluate_strings(prediction=prediction)
{'score': 0, 'reasoning': 'Expecting property name enclosed in double quotes'}
"""
def __init__(self, **kwargs: Any) -> None:
super().__init__()
@property
def requires_input(self) -> bool:
return False
@property
def requires_reference(self) -> bool:
return False
@property
def evaluation_name(self) -> str:
return "json_validity"
def _evaluate_strings(
self,
prediction: str,
input: Optional[str] = None,
reference: Optional[str] = None,
**kwargs: Any
) -> dict:
"""Evaluate the prediction string.
Args:
prediction (str): The prediction string to evaluate.
input (str, optional): Not used in this evaluator. Defaults to None.
reference (str, optional): Not used in this evaluator. Defaults to None.
Returns:
dict: A dictionary containing the evaluation score. The score is 1 if
the prediction is valid JSON, and 0 otherwise.
If the prediction is not valid JSON, the dictionary also contains
a "reasoning" field with the error message.
"""
try:
parse_json_markdown(prediction)
return {"score": 1}
except Exception as e:
return {"score": 0, "reasoning": str(e)}
class JsonEqualityEvaluator(StringEvaluator):
"""Evaluates whether the prediction is equal to the reference after
parsing both as JSON.
This evaluator checks if the prediction, after parsing as JSON, is equal
to the reference, which is also parsed as JSON. It does not require an
input string.
Attributes:
requires_input (bool): Whether this evaluator requires an
input string. Always False.
requires_reference (bool): Whether this evaluator requires
a reference string. Always True.
evaluation_name (str): The name of the evaluation metric.
Always "parsed_equality".
Examples:
>>> evaluator = JsonEqualityEvaluator()
>>> evaluator.evaluate_strings(prediction='{"a": 1}', reference='{"a": 1}')
{'score': True}
>>> evaluator.evaluate_strings(prediction='{"a": 1}', reference='{"a": 2}')
{'score': False}
>>> evaluator = JsonEqualityEvaluator(operator=lambda x, y: x['a'] == y['a'])
>>> evaluator.evaluate_strings(prediction='{"a": 1}', reference='{"a": 1}')
{'score': True}
>>> evaluator.evaluate_strings(prediction='{"a": 1}', reference='{"a": 2}')
{'score': False}
"""
def __init__(self, operator: Optional[Callable] = None, **kwargs: Any) -> None:
super().__init__()
self.operator = operator or eq
@property
def requires_input(self) -> bool:
return False
@property
def requires_reference(self) -> bool:
return True
@property
def evaluation_name(self) -> str:
return "json_equality"
def _parse_json(
self, string: str
) -> Union[dict, list, None, float, bool, int, str]:
return parse_json_markdown(string)
def _evaluate_strings(
self,
prediction: str,
input: Optional[str] = None,
reference: Optional[str] = None,
**kwargs: Any
) -> dict:
"""Evaluate the prediction string.
Args:
prediction (str): The prediction string to evaluate.
input (str, optional): Not used in this evaluator.
reference (str): The reference string to compare against.
Returns:
dict: A dictionary containing the evaluation score.
"""
parsed = self._parse_json(prediction)
label = self._parse_json(cast(str, reference))
if isinstance(label, list):
if not isinstance(parsed, list):
return {"score": 0}
parsed = sorted(parsed, key=lambda x: str(x))
label = sorted(label, key=lambda x: str(x))
return {"score": self.operator(parsed, label)}

@ -1,9 +1,11 @@
"""Interfaces to be implemented by general evaluators."""
from __future__ import annotations
import asyncio
import logging
from abc import ABC, abstractmethod
from enum import Enum
from functools import partial
from typing import Any, Optional, Sequence, Tuple
from warnings import warn
@ -48,6 +50,10 @@ class EvaluatorType(str, Enum):
"""Compare a prediction to a reference label using embedding distance."""
PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
"""Compare two predictions using embedding distance."""
JSON_VALIDITY = "json_validity"
"""Check if a prediction is valid JSON."""
JSON_EQUALITY = "json_equality"
"""Check if a prediction is equal to a reference JSON."""
class LLMEvalChain(Chain):
@ -115,7 +121,7 @@ class StringEvaluator(_EvalArgsMixin, ABC):
@property
def evaluation_name(self) -> str:
"""The name of the evaluation."""
raise NotImplementedError()
return self.__class__.__name__
@property
def requires_reference(self) -> bool:
@ -168,9 +174,15 @@ class StringEvaluator(_EvalArgsMixin, ABC):
- value: the string value of the evaluation, if applicable.
- reasoning: the reasoning for the evaluation, if applicable.
""" # noqa: E501
raise NotImplementedError(
f"{self.__class__.__name__} hasn't implemented an async "
"aevaluate_strings method."
return await asyncio.get_running_loop().run_in_executor(
None,
partial(
self._evaluate_strings,
prediction=prediction,
reference=reference,
input=input,
**kwargs,
),
)
def evaluate_strings(
@ -265,9 +277,16 @@ class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
Returns:
dict: A dictionary containing the preference, scores, and/or other information.
""" # noqa: E501
raise NotImplementedError(
f"{self.__class__.__name__} hasn't implemented an async "
"aevaluate_string_pairs method."
return await asyncio.get_running_loop().run_in_executor(
None,
partial(
self._evaluate_string_pairs,
prediction=prediction,
prediction_b=prediction_b,
reference=reference,
input=input,
**kwargs,
),
)
def evaluate_string_pairs(
@ -381,9 +400,16 @@ class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
Returns:
dict: The evaluation result.
"""
raise NotImplementedError(
f"{self.__class__.__name__} hasn't implemented an async "
"aevaluate_agent_trajectory method."
return await asyncio.get_running_loop().run_in_executor(
None,
partial(
self._evaluate_agent_trajectory,
prediction=prediction,
agent_trajectory=agent_trajectory,
reference=reference,
input=input,
**kwargs,
),
)
def evaluate_agent_trajectory(
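Net effect of the three replaced `NotImplementedError` bodies above: evaluators that only implement the synchronous `_evaluate_*` methods now get a usable async API for free, with the sync call dispatched to the event loop's default executor. A small sketch using one of the new evaluators:

```python
import asyncio

from langchain.evaluation.parsing.base import JsonValidityEvaluator

async def main() -> None:
    evaluator = JsonValidityEvaluator()
    # JsonValidityEvaluator defines no _aevaluate_strings of its own, so the base
    # class now runs the synchronous _evaluate_strings in the default executor.
    result = await evaluator.aevaluate_strings(prediction='{"a": 1}')
    print(result)  # {'score': 1}

asyncio.run(main())
```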

@ -83,7 +83,9 @@ class RunEvalConfig(BaseModel):
The language model to pass to any evaluators that use a language model.
""" # noqa: E501
evaluators: List[Union[EvaluatorType, EvalConfig]] = Field(default_factory=list)
evaluators: List[Union[EvaluatorType, str, EvalConfig]] = Field(
default_factory=list
)
"""Configurations for which evaluators to apply to the dataset run.
Each can be the string of an
:class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`, such
@ -239,4 +241,22 @@ class RunEvalConfig(BaseModel):
llm: Optional[BaseLanguageModel] = None
prompt: Optional[BasePromptTemplate] = None
class JsonValidity(EvalConfig):
"""Configuration for a json validity evaluator.
Parameters
----------
"""
evaluator_type: EvaluatorType = EvaluatorType.JSON_VALIDITY
class JsonEqualityEvaluator(EvalConfig):
"""Configuration for a json equality evaluator.
Parameters
----------
"""
evaluator_type: EvaluatorType = EvaluatorType.JSON_EQUALITY
# TODO: Trajectory
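A hedged sketch of how the widened `evaluators` field and the new config classes could be combined (the nested-class access mirrors the existing config classes defined inside `RunEvalConfig`; the resulting config is then handed to `run_on_dataset` as its evaluation config):

```python
from langchain.smith import RunEvalConfig

# Evaluators may now be plain strings, EvaluatorType members, or EvalConfig objects.
eval_config = RunEvalConfig(
    evaluators=[
        "json_validity",                       # coerced to EvaluatorType.JSON_VALIDITY downstream
        RunEvalConfig.JsonValidity(),          # nested config-class form added in this diff
        RunEvalConfig.JsonEqualityEvaluator(), # equality check needs a reference in the dataset
    ]
)
```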

@ -462,7 +462,7 @@ def _determine_reference_key(
def _construct_run_evaluator(
eval_config: Union[EvaluatorType, EvalConfig],
eval_config: Union[EvaluatorType, str, EvalConfig],
eval_llm: BaseLanguageModel,
run_type: str,
data_type: DataType,
@ -471,7 +471,9 @@ def _construct_run_evaluator(
input_key: Optional[str],
prediction_key: Optional[str],
) -> RunEvaluator:
if isinstance(eval_config, EvaluatorType):
if isinstance(eval_config, (EvaluatorType, str)):
if not isinstance(eval_config, EvaluatorType):
eval_config = EvaluatorType(eval_config)
evaluator_ = load_evaluator(eval_config, llm=eval_llm)
eval_type_tag = eval_config.value
else:
@ -1310,7 +1312,7 @@ def _handle_coroutine(coro: Coroutine) -> Any:
except RuntimeError: # No event loop
return asyncio.run(coro)
if loop.is_running():
return loop.create_task(coro)
return loop.run_until_complete(coro)
else:
return asyncio.run(coro)
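For reference, the string coercion in `_construct_run_evaluator` above relies on `EvaluatorType` being a `str`-backed enum that supports lookup by value:

```python
from langchain.evaluation.schema import EvaluatorType

# A plain string accepted by RunEvalConfig is turned into the enum member
# that keys the evaluator registry.
assert EvaluatorType("json_validity") is EvaluatorType.JSON_VALIDITY
assert EvaluatorType.JSON_VALIDITY.value == "json_validity"
```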

@ -0,0 +1,177 @@
import random
import pytest
from langchain.evaluation.parsing.base import (
JsonEqualityEvaluator,
JsonValidityEvaluator,
)
@pytest.fixture
def json_validity_evaluator() -> JsonValidityEvaluator:
return JsonValidityEvaluator()
def test_json_validity_evaluator_requires_input(
json_validity_evaluator: JsonValidityEvaluator,
) -> None:
assert json_validity_evaluator.requires_input is False
def test_json_validity_evaluator_requires_reference(
json_validity_evaluator: JsonValidityEvaluator,
) -> None:
assert json_validity_evaluator.requires_reference is False
def test_json_validity_evaluator_evaluation_name(
json_validity_evaluator: JsonValidityEvaluator,
) -> None:
assert json_validity_evaluator.evaluation_name == "json_validity"
def test_json_validity_evaluator_evaluate_valid_json(
json_validity_evaluator: JsonValidityEvaluator,
) -> None:
prediction = '{"name": "John", "age": 30, "city": "New York"}'
result = json_validity_evaluator.evaluate_strings(prediction=prediction)
assert result == {"score": 1}
def test_json_validity_evaluator_evaluate_invalid_json(
json_validity_evaluator: JsonValidityEvaluator,
) -> None:
prediction = '{"name": "John", "age": 30, "city": "New York",}'
result = json_validity_evaluator.evaluate_strings(prediction=prediction)
assert result["score"] == 0
assert result["reasoning"].startswith(
"Expecting property name enclosed in double quotes"
)
@pytest.fixture
def json_equality_evaluator() -> JsonEqualityEvaluator:
return JsonEqualityEvaluator()
def test_json_equality_evaluator_requires_input(
json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
assert json_equality_evaluator.requires_input is False
def test_json_equality_evaluator_requires_reference(
json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
assert json_equality_evaluator.requires_reference is True
def test_json_equality_evaluator_evaluation_name(
json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
assert json_equality_evaluator.evaluation_name == "json_equality"
def test_json_equality_evaluator_parse_json(
json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
string = '{"a": 1}'
result = json_equality_evaluator._parse_json(string)
assert result == {"a": 1}
def test_json_equality_evaluator_evaluate_strings_equal(
json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
prediction = '{"a": 1}'
reference = '{"a": 1}'
result = json_equality_evaluator.evaluate_strings(
prediction=prediction, reference=reference
)
assert result == {"score": True}
def test_json_equality_evaluator_evaluate_strings_not_equal(
json_equality_evaluator: JsonEqualityEvaluator,
) -> None:
prediction = '{"a": 1}'
reference = '{"a": 2}'
result = json_equality_evaluator.evaluate_strings(
prediction=prediction, reference=reference
)
assert result == {"score": False}
def test_json_equality_evaluator_evaluate_strings_custom_operator_equal() -> None:
def operator(x: dict, y: dict) -> bool:
return x["a"] == y["a"]
evaluator = JsonEqualityEvaluator(operator=operator)
prediction = '{"a": 1, "b": 2}'
reference = '{"a": 1, "c": 3}'
result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
assert result == {"score": True}
def test_json_equality_evaluator_evaluate_strings_custom_operator_not_equal() -> None:
def operator(x: dict, y: dict) -> bool:
return x["a"] == y["a"]
evaluator = JsonEqualityEvaluator(operator=operator)
prediction = '{"a": 1}'
reference = '{"a": 2}'
result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
assert result == {"score": False}
def test_json_equality_evaluator_evaluate_lists_permutation_invariant() -> None:
evaluator = JsonEqualityEvaluator()
prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
reference = '[{"a": 2, "b": 3}, {"a": 1, "b": 2}]'
result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
assert result == {"score": True}
prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
reference = '[{"a": 2, "b": 3}, {"a": 1, "b": 4}]'
result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
assert result == {"score": False}
prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
reference = '[{"a": 2, "b": 3}]'
result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
assert result == {"score": False}
prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
reference = '[{"a": 2, "b": 3}, {"a": 1, "b": 2}, {"a": 3, "b": 4}]'
result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
assert result == {"score": False}
prediction = '[{"a": 1, "b": 2}, {"a": 2, "b": 3}]'
reference = '[{"a": 2, "b": 3}, {"b": 2,"a": 1}, {"a": 3, "b": 4}]'
result = evaluator.evaluate_strings(prediction=reference, reference=prediction)
assert result == {"score": False}
# Limit tests
prediction = (
"[" + ",".join([f'{{"a": {i}, "b": {i+1}}}' for i in range(1000)]) + "]"
)
rlist = [f'{{"a": {i}, "b": {i+1}}}' for i in range(1000)]
random.shuffle(rlist)
reference = "[" + ",".join(rlist) + "]"
result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
assert result == {"score": True}
prediction = (
"[" + ",".join([f'{{"b": {i+1}, "a": {i}}}' for i in range(1000)]) + "]"
)
reference = (
"["
+ ",".join(
[f'{{"a": {i+1}, "b": {i+2}}}' for i in range(999)]
+ ['{"a": 1000, "b": 1001}']
)
+ "]"
)
result = evaluator.evaluate_strings(prediction=prediction, reference=reference)
assert result == {"score": False}

@ -40,6 +40,7 @@ def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
EvaluatorType.LABELED_CRITERIA,
EvaluatorType.LABELED_PAIRWISE_STRING,
],
[EvaluatorType.JSON_EQUALITY],
],
)
def test_eval_chain_requires_references(evaluator_types: List[EvaluatorType]) -> None:
