|
|
|
@ -1,14 +1,63 @@
|
|
|
|
|
"""Interfaces to be implemented by general evaluators."""
|
|
|
|
|
from abc import abstractmethod
|
|
|
|
|
from typing import Any, Optional, Protocol, runtime_checkable
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import logging
|
|
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
|
from typing import Any, Optional
|
|
|
|
|
from warnings import warn
|
|
|
|
|
|
|
|
|
|
@runtime_checkable
|
|
|
|
|
class StringEvaluator(Protocol):
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class _EvalArgsMixin:
|
|
|
|
|
"""Mixin for checking evaluation arguments."""
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def requires_reference(self) -> bool:
|
|
|
|
|
"""Whether this evaluator requires a reference label."""
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def requires_input(self) -> bool:
|
|
|
|
|
"""Whether this evaluator requires an input string."""
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def _skip_input_warning(self) -> str:
|
|
|
|
|
"""Warning to show when input is ignored."""
|
|
|
|
|
return f"Ignoring input in {self.__class__.__name__}, as it is not expected."
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def _skip_reference_warning(self) -> str:
|
|
|
|
|
"""Warning to show when reference is ignored."""
|
|
|
|
|
return (
|
|
|
|
|
f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def _check_evaluation_args(
|
|
|
|
|
self,
|
|
|
|
|
reference: Optional[str] = None,
|
|
|
|
|
input: Optional[str] = None,
|
|
|
|
|
) -> None:
|
|
|
|
|
if self.requires_input and input is None:
|
|
|
|
|
raise ValueError(f"{self.__class__.__name__} requires an input string.")
|
|
|
|
|
elif input is not None and not self.requires_input:
|
|
|
|
|
warn(self._skip_input_warning)
|
|
|
|
|
else:
|
|
|
|
|
pass
|
|
|
|
|
if self.requires_reference and reference is None:
|
|
|
|
|
raise ValueError(f"{self.__class__.__name__} requires a reference string.")
|
|
|
|
|
elif reference is not None and not self.requires_reference:
|
|
|
|
|
warn(self._skip_reference_warning)
|
|
|
|
|
else:
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class StringEvaluator(_EvalArgsMixin, ABC):
|
|
|
|
|
"""Protocol for evaluating strings."""
|
|
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
|
def evaluate_strings(
|
|
|
|
|
def _evaluate_strings(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
prediction: str,
|
|
|
|
@ -28,7 +77,7 @@ class StringEvaluator(Protocol):
|
|
|
|
|
dict: The evaluation results containing the score or value.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
async def aevaluate_strings(
|
|
|
|
|
async def _aevaluate_strings(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
prediction: str,
|
|
|
|
@ -53,13 +102,61 @@ class StringEvaluator(Protocol):
|
|
|
|
|
"async aevaluate_strings method."
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def evaluate_strings(
    self,
    *,
    prediction: str,
    reference: Optional[str] = None,
    input: Optional[str] = None,
    **kwargs: Any,
) -> dict:
    """Evaluate Chain or LLM output, based on optional input and label.

    Validates the arguments against this evaluator's requirements, then
    delegates to the concrete ``_evaluate_strings`` implementation.

    Args:
        prediction (str): the LLM or chain prediction to evaluate.
        reference (Optional[str], optional): the reference label
            to evaluate against.
        input (Optional[str], optional): the input to consider during evaluation
        **kwargs: additional keyword arguments, including callbacks, tags, etc.
    Returns:
        dict: The evaluation results containing the score or value.
    """
    self._check_evaluation_args(reference=reference, input=input)
    call_kwargs = dict(
        kwargs, prediction=prediction, reference=reference, input=input
    )
    return self._evaluate_strings(**call_kwargs)
|
|
|
|
|
|
|
|
|
|
async def aevaluate_strings(
    self,
    *,
    prediction: str,
    reference: Optional[str] = None,
    input: Optional[str] = None,
    **kwargs: Any,
) -> dict:
    """Asynchronously evaluate Chain or LLM output, based on optional
    input and label.

    Validates the arguments against this evaluator's requirements, then
    awaits the concrete ``_aevaluate_strings`` implementation.

    Args:
        prediction (str): the LLM or chain prediction to evaluate.
        reference (Optional[str], optional): the reference label
            to evaluate against.
        input (Optional[str], optional): the input to consider during evaluation
        **kwargs: additional keyword arguments, including callbacks, tags, etc.
    Returns:
        dict: The evaluation results containing the score or value.
    """
    self._check_evaluation_args(reference=reference, input=input)
    call_kwargs = dict(
        kwargs, prediction=prediction, reference=reference, input=input
    )
    return await self._aevaluate_strings(**call_kwargs)
|
|
|
|
|
|
|
|
|
|
@runtime_checkable
|
|
|
|
|
class PairwiseStringEvaluator(Protocol):
|
|
|
|
|
|
|
|
|
|
class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
|
|
|
|
|
"""A protocol for comparing the output of two models."""
|
|
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
|
def evaluate_string_pairs(
|
|
|
|
|
def _evaluate_string_pairs(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
prediction: str,
|
|
|
|
@ -84,8 +181,9 @@ class PairwiseStringEvaluator(Protocol):
|
|
|
|
|
other information.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
async def aevaluate_string_pairs(
|
|
|
|
|
async def _aevaluate_string_pairs(
|
|
|
|
|
self,
|
|
|
|
|
*,
|
|
|
|
|
prediction: str,
|
|
|
|
|
prediction_b: str,
|
|
|
|
|
reference: Optional[str] = None,
|
|
|
|
@ -111,3 +209,69 @@ class PairwiseStringEvaluator(Protocol):
|
|
|
|
|
f"{self.__class__.__name__} hasn't implemented an async "
|
|
|
|
|
"aevaluate_string_pairs method."
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def evaluate_string_pairs(
    self,
    *,
    prediction: str,
    prediction_b: str,
    reference: Optional[str] = None,
    input: Optional[str] = None,
    **kwargs: Any,
) -> dict:
    """Evaluate the output string pairs.

    Validates the arguments against this evaluator's requirements, then
    delegates to the concrete ``_evaluate_string_pairs`` implementation.

    Args:
        prediction (str): The output string from the first model.
        prediction_b (str): The output string from the second model.
        reference (str, optional): The expected output / reference
            string. Defaults to None.
        input (str, optional): The input string. Defaults to None.
        **kwargs (Any): Additional keyword arguments, such
            as callbacks and optional reference strings.

    Returns:
        dict: A dictionary containing the preference, scores, and/or
            other information.
    """
    self._check_evaluation_args(reference=reference, input=input)
    call_kwargs = dict(
        kwargs,
        prediction=prediction,
        prediction_b=prediction_b,
        reference=reference,
        input=input,
    )
    return self._evaluate_string_pairs(**call_kwargs)
|
|
|
|
|
|
|
|
|
|
async def aevaluate_string_pairs(
    self,
    *,
    prediction: str,
    prediction_b: str,
    reference: Optional[str] = None,
    input: Optional[str] = None,
    **kwargs: Any,
) -> dict:
    """Evaluate the output string pairs.

    Validates the arguments against this evaluator's requirements, then
    awaits the concrete ``_aevaluate_string_pairs`` implementation.

    Args:
        prediction (str): The output string from the first model.
        prediction_b (str): The output string from the second model.
        reference (str, optional): The expected output / reference
            string. Defaults to None.
        input (str, optional): The input string. Defaults to None.
        **kwargs (Any): Additional keyword arguments, such
            as callbacks and optional reference strings.

    Returns:
        dict: A dictionary containing the preference, scores, and/or
            other information.
    """
    self._check_evaluation_args(reference=reference, input=input)
    call_kwargs = dict(
        kwargs,
        prediction=prediction,
        prediction_b=prediction_b,
        reference=reference,
        input=input,
    )
    return await self._aevaluate_string_pairs(**call_kwargs)
|
|
|
|
|