mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
Override Keys Option (#13537)
This lets you override the globally configured keys when you want to evaluate different outputs in a single run.
This commit is contained in:
parent e584b28c54
commit 5a28dc3210
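For context, a minimal usage sketch of the new per-evaluator override (the chain output keys "answer" and "sql" below are made up for illustration; before this change every evaluator had to share the single, globally inferred keys):

from langchain.smith import RunEvalConfig

eval_config = RunEvalConfig(
    evaluators=[
        # Grade the hypothetical "answer" output with the QA evaluator...
        RunEvalConfig.QA(prediction_key="answer"),
        # ...while the exact-match evaluator looks at the hypothetical "sql" output.
        RunEvalConfig.ExactMatch(prediction_key="sql"),
    ],
)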
@@ -54,6 +54,26 @@ class EvalConfig(BaseModel):
         return kwargs
 
 
+class SingleKeyEvalConfig(EvalConfig):
+    reference_key: Optional[str] = None
+    """The key in the dataset run to use as the reference string.
+    If not provided, we will attempt to infer automatically."""
+    prediction_key: Optional[str] = None
+    """The key from the traced run's outputs dictionary to use to
+    represent the prediction. If not provided, it will be inferred
+    automatically."""
+    input_key: Optional[str] = None
+    """The key from the traced run's inputs dictionary to use to represent the
+    input. If not provided, it will be inferred automatically."""
+
+    def get_kwargs(self) -> Dict[str, Any]:
+        kwargs = super().get_kwargs()
+        # Filter out the keys that are not needed for the evaluator.
+        for key in ["reference_key", "prediction_key", "input_key"]:
+            kwargs.pop(key, None)
+        return kwargs
+
+
 class RunEvalConfig(BaseModel):
     """Configuration for a run evaluation.
 
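One detail worth calling out in the block above: the three *_key fields steer the evaluation harness (which dataset or run fields to read), not the evaluator itself, so get_kwargs() strips them before the remaining options are forwarded to the evaluator loader. A self-contained sketch of that filtering pattern, with made-up field names and no LangChain imports:

from dataclasses import dataclass, asdict
from typing import Any, Dict, Optional

@dataclass
class _ExampleConfig:
    # Illustrative stand-in for a SingleKeyEvalConfig-style config.
    prediction_key: Optional[str] = None
    reference_key: Optional[str] = None
    input_key: Optional[str] = None
    temperature: float = 0.0  # an evaluator-facing option

    def get_kwargs(self) -> Dict[str, Any]:
        kwargs = {k: v for k, v in asdict(self).items() if v is not None}
        # The *_key fields are consumed by the harness, not the evaluator,
        # so drop them before the remaining options are passed along.
        for key in ("reference_key", "prediction_key", "input_key"):
            kwargs.pop(key, None)
        return kwargs

print(_ExampleConfig(prediction_key="answer", temperature=0.5).get_kwargs())
# -> {'temperature': 0.5}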
@@ -113,7 +133,7 @@ class RunEvalConfig(BaseModel):
     class Config:
         arbitrary_types_allowed = True
 
-    class Criteria(EvalConfig):
+    class Criteria(SingleKeyEvalConfig):
         """Configuration for a reference-free criteria evaluator.
 
         Parameters
@@ -134,7 +154,7 @@ class RunEvalConfig(BaseModel):
         ) -> None:
             super().__init__(criteria=criteria, **kwargs)
 
-    class LabeledCriteria(EvalConfig):
+    class LabeledCriteria(SingleKeyEvalConfig):
         """Configuration for a labeled (with references) criteria evaluator.
 
         Parameters
@@ -154,7 +174,7 @@ class RunEvalConfig(BaseModel):
         ) -> None:
             super().__init__(criteria=criteria, **kwargs)
 
-    class EmbeddingDistance(EvalConfig):
+    class EmbeddingDistance(SingleKeyEvalConfig):
         """Configuration for an embedding distance evaluator.
 
         Parameters
@@ -174,7 +194,7 @@ class RunEvalConfig(BaseModel):
         class Config:
             arbitrary_types_allowed = True
 
-    class StringDistance(EvalConfig):
+    class StringDistance(SingleKeyEvalConfig):
         """Configuration for a string distance evaluator.
 
         Parameters
@@ -196,7 +216,7 @@ class RunEvalConfig(BaseModel):
         """Whether to normalize the distance to between 0 and 1.
         Applies only to the Levenshtein and Damerau-Levenshtein distances."""
 
-    class QA(EvalConfig):
+    class QA(SingleKeyEvalConfig):
         """Configuration for a QA evaluator.
 
         Parameters
@@ -211,7 +231,7 @@ class RunEvalConfig(BaseModel):
         llm: Optional[BaseLanguageModel] = None
         prompt: Optional[BasePromptTemplate] = None
 
-    class ContextQA(EvalConfig):
+    class ContextQA(SingleKeyEvalConfig):
         """Configuration for a context-based QA evaluator.
 
         Parameters
@@ -227,7 +247,7 @@ class RunEvalConfig(BaseModel):
         llm: Optional[BaseLanguageModel] = None
         prompt: Optional[BasePromptTemplate] = None
 
-    class CoTQA(EvalConfig):
+    class CoTQA(SingleKeyEvalConfig):
         """Configuration for a context-based QA evaluator.
 
         Parameters
@@ -243,7 +263,7 @@ class RunEvalConfig(BaseModel):
         llm: Optional[BaseLanguageModel] = None
         prompt: Optional[BasePromptTemplate] = None
 
-    class JsonValidity(EvalConfig):
+    class JsonValidity(SingleKeyEvalConfig):
         """Configuration for a json validity evaluator.
 
         Parameters
@@ -261,7 +281,7 @@ class RunEvalConfig(BaseModel):
 
         evaluator_type: EvaluatorType = EvaluatorType.JSON_EQUALITY
 
-    class ExactMatch(EvalConfig):
+    class ExactMatch(SingleKeyEvalConfig):
         """Configuration for an exact match string evaluator.
 
         Parameters
@@ -279,7 +299,7 @@ class RunEvalConfig(BaseModel):
         ignore_punctuation: bool = False
         ignore_numbers: bool = False
 
-    class RegexMatch(EvalConfig):
+    class RegexMatch(SingleKeyEvalConfig):
         """Configuration for a regex match string evaluator.
 
         Parameters
@@ -291,7 +311,7 @@ class RunEvalConfig(BaseModel):
         evaluator_type: EvaluatorType = EvaluatorType.REGEX_MATCH
         flags: int = 0
 
-    class ScoreString(EvalConfig):
+    class ScoreString(SingleKeyEvalConfig):
         """Configuration for a score string evaluator.
         This is like the criteria evaluator but it is configured by
         default to return a score on the scale from 1-10.
@@ -487,6 +487,11 @@ def _construct_run_evaluator(
     kwargs = {"llm": eval_llm, **eval_config.get_kwargs()}
     evaluator_ = load_evaluator(eval_config.evaluator_type, **kwargs)
     eval_type_tag = eval_config.evaluator_type.value
+    # Override keys if specified in the config
+    if isinstance(eval_config, smith_eval_config.SingleKeyEvalConfig):
+        input_key = eval_config.input_key or input_key
+        prediction_key = eval_config.prediction_key or prediction_key
+        reference_key = eval_config.reference_key or reference_key
 
     if isinstance(evaluator_, StringEvaluator):
         if evaluator_.requires_reference and reference_key is None:
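The override semantics in the hunk above are plain or-fallback: a key set on the individual evaluator config wins, and anything left unset falls back to the globally inferred key. A tiny illustration with made-up values:

# Keys inferred once, globally, from the dataset example and the traced run.
input_key, prediction_key, reference_key = "input", "output", "expected"

# A config that only overrides the prediction key.
config_prediction_key, config_reference_key = "answer", None

prediction_key = config_prediction_key or prediction_key  # "answer"   (override wins)
reference_key = config_reference_key or reference_key     # "expected" (falls back)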