Load Run Evaluator (#7101)

Current problems:
1. Evaluating LLMs or Chat models isn't smooth. Even specifying
'generations' as the output inserts a redundant list into the eval
template
2. Configuring input / prediction / reference keys in the
`get_qa_evaluator` function is confusing. Unless you are using a chain
with the default keys, you have to specify all the variables and need to
reason about whether the key corresponds to the traced run's inputs,
outputs or the examples inputs or outputs.


Proposal:
- Configure the run evaluator according to a model. Use the model type
and input/output keys to assert compatibility where possible. Only need
to specify a reference_key for certain evaluators (which is less
confusing than specifying input keys)


When does this work:
- If you have your langchain model available (assumed always for
run_on_dataset flow)
- If you are evaluating an LLM, Chat model, or chain
- If the LLM or chat models are traced by langchain (wouldn't work if
you add an incompatible schema via the REST API)

When would this fail:
- Currently if you directly create an example from an LLM run, the
outputs are generations with all the extra metadata present. A simple
`example_key` and dumping all to the template could make the evaluations
unreliable
- Doesn't help if you're not using the low level API
- If you want to instantiate the evaluator without instantiating your
chain or LLM (maybe common for monitoring, for instance) -> could also
load from run or run type though

What's ugly:
- Personally think it's better to load evaluators one by one since
passing a config down is pretty confusing.
- Lots of testing needs to be added
- Inconsistent in that it makes a separate run and example input mapper
instead of the original `RunEvaluatorInputMapper`, which maps a run and
example to a single input.

Example usage running the for an LLM, Chat Model, and Agent.

```
# Test running for the string evaluators
evaluator_names = ["qa", "criteria"]

model = ChatOpenAI()
configured_evaluators = load_run_evaluators_for_model(evaluator_names, model=model, reference_key="answer")
run_on_dataset(ds_name, model, run_evaluators=configured_evaluators)
```


<details>
  <summary>Full code with dataset upload</summary>
```
## Create dataset
from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model
from langchain.evaluation import load_dataset
import pandas as pd

lcds = load_dataset("llm-math")
df = pd.DataFrame(lcds)

from uuid import uuid4
from langsmith import Client
client = Client()
ds_name = "llm-math - " + str(uuid4())[0:8]
ds = client.upload_dataframe(df, name=ds_name, input_keys=["question"], output_keys=["answer"])



## Define the models we'll test over
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, AgentType

from langchain.tools import tool

llm = OpenAI(temperature=0)
chat_model = ChatOpenAI(temperature=0)

@tool
    def sum(a: float, b: float) -> float:
        """Add two numbers"""
        return a + b
    
def construct_agent():
    return initialize_agent(
        llm=chat_model,
        tools=[sum],
        agent=AgentType.OPENAI_MULTI_FUNCTIONS,
    )

agent = construct_agent()

# Test running for the string evaluators
evaluator_names = ["qa", "criteria"]

models = [llm, chat_model, agent]
run_evaluators = []
for model in models:
    run_evaluators.append(load_run_evaluators_for_model(evaluator_names, model=model, reference_key="answer"))
    

# Run on LLM, Chat Model, and Agent
from langchain.client.runner_utils import run_on_dataset

to_test = [llm, chat_model, construct_agent]

for model, configured_evaluators in zip(to_test, run_evaluators):
    run_on_dataset(ds_name, model, run_evaluators=configured_evaluators, verbose=True)
```
</details>

---------

Co-authored-by: Nuno Campos <nuno@boringbits.io>
pull/7123/head^2
William FH 1 year ago committed by GitHub
parent 1ac347b4e3
commit c5edbea34a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -99,6 +99,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser)
"""The parser to use to map the output to a structured result."""
criteria_names: List[str] = Field(default_factory=list)
"""The names of the criteria being evaluated."""
class Config:
"""Configuration for the QAEvalChain."""
@ -107,12 +109,24 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
@property
def requires_reference(self) -> bool:
"""Whether the evaluation requires a reference text."""
return "reference" in self.prompt.input_variables
@property
def requires_input(self) -> bool:
return True
@property
def evaluation_name(self) -> str:
"""Get the name of the evaluation.
Returns
-------
str
The name of the evaluation.
"""
return " ".join(self.criteria_names)
@property
def _skip_reference_warning(self) -> str:
"""Warning to show when reference is ignored."""
@ -266,9 +280,15 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
)
criteria_ = cls.resolve_criteria(criteria)
criteria_names = list(criteria_.keys())
criteria_str = " ".join(f"{k}: {v}" for k, v in criteria_.items())
prompt_ = prompt.partial(criteria=criteria_str)
return cls(llm=llm, prompt=prompt_, **kwargs)
return cls(
llm=llm,
prompt=prompt_,
criteria_names=criteria_names,
**kwargs,
)
def _get_eval_input(
self,

@ -67,9 +67,14 @@ def load_evaluator(
Examples
--------
>>> llm = ChatOpenAI(model="gpt-4", temperature=0)
>>> evaluator = load_evaluator(EvaluatorType.QA, llm=llm)
>>> evaluator = _load_evaluator("qa", llm=llm)
"""
llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
if evaluator not in _EVALUATOR_MAP:
raise ValueError(
f"Unknown evaluator type: {evaluator}"
f"Valid types are: {list(_EVALUATOR_MAP.keys())}"
)
return _EVALUATOR_MAP[evaluator].from_llm(llm=llm, **kwargs)

@ -49,6 +49,10 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
extra = Extra.ignore
@property
def evaluation_name(self) -> str:
return "correctness"
@property
def requires_reference(self) -> bool:
return True
@ -155,10 +159,12 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
@property
def requires_reference(self) -> bool:
"""Whether the chain requires a reference string."""
return True
@property
def requires_input(self) -> bool:
"""Whether the chain requires an input string."""
return True
@classmethod
@ -170,6 +176,10 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
f"but got {prompt.input_variables}"
)
@property
def evaluation_name(self) -> str:
return "Contextual Accuracy"
@classmethod
def from_llm(
cls,
@ -250,6 +260,10 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
class CotQAEvalChain(ContextQAEvalChain):
"""LLM Chain specifically for evaluating QA using chain of thought reasoning."""
@property
def evaluation_name(self) -> str:
return "COT Contextual Accuracy"
@classmethod
def from_llm(
cls, llm: BaseLanguageModel, prompt: PromptTemplate = COT_PROMPT, **kwargs: Any

@ -11,6 +11,9 @@ from langchain.evaluation.run_evaluators.implementations import (
get_qa_evaluator,
get_trajectory_evaluator,
)
from langchain.evaluation.run_evaluators.string_run_evaluator import (
StringRunEvaluatorChain,
)
__all__ = [
"RunEvaluatorChain",
@ -21,4 +24,5 @@ __all__ = [
"get_trajectory_evaluator",
"StringRunEvaluatorInputMapper",
"ChoicesOutputParser",
"StringRunEvaluatorChain",
]

@ -21,6 +21,10 @@ class RunEvaluatorInputMapper:
def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
"""Maps the Run and Optional[Example] to a dictionary"""
def __call__(self, run: Run, example: Optional[Example] = None) -> Any:
"""Maps the Run and Optional[Example] to a dictionary"""
return self.map(run, example)
class RunEvaluatorOutputParser(BaseOutputParser[EvaluationResult]):
"""Parse the output of a run."""

@ -0,0 +1,69 @@
""""Loading helpers for run evaluators."""
from typing import Any, List, Optional, Sequence, Union
from langchainplus_sdk import RunEvaluator
from langchain.base_language import BaseLanguageModel
from langchain.chains.base import Chain
from langchain.evaluation.loading import load_evaluators
from langchain.evaluation.run_evaluators.string_run_evaluator import (
StringRunEvaluatorChain,
)
from langchain.evaluation.schema import EvaluatorType, StringEvaluator
from langchain.tools.base import Tool
def load_run_evaluators_for_model(
evaluators: Sequence[EvaluatorType],
model: Union[Chain, BaseLanguageModel, Tool],
*,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
reference_key: Optional[str] = None,
eval_llm: Optional[BaseLanguageModel] = None,
**kwargs: Any,
) -> List[RunEvaluator]:
"""Load evaluators specified by a list of evaluator types.
Parameters
----------
evaluators : Sequence[EvaluatorType]
The list of evaluator types to load.
model : Union[Chain, BaseLanguageModel, Tool]
The model to evaluate. Used to infer how to parse the run.
input_key : Optional[str], a chain run's input key to map
to the evaluator's input
prediction_key : Optional[str], the key in the run's outputs to
represent the Chain prediction
reference_key : Optional[str], the key in the dataset example (row)
outputs to represent the reference, or ground-truth label
eval_llm : BaseLanguageModel, optional
The language model to use for evaluation, if none is provided, a default
ChatOpenAI gpt-4 model will be used.
**kwargs : Any
Additional keyword arguments to pass to all evaluators.
Returns
-------
List[RunEvaluator]
The loaded Run evaluators.
"""
evaluators_ = load_evaluators(evaluators, llm=eval_llm, **kwargs)
run_evaluators = []
for evaluator in evaluators_:
if isinstance(evaluator, StringEvaluator):
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model,
evaluator,
input_key=input_key,
prediction_key=prediction_key,
reference_key=reference_key,
)
else:
raise NotImplementedError(
f"Run evaluator for {evaluator} is not implemented"
)
run_evaluators.append(run_evaluator)
return run_evaluators

@ -0,0 +1,385 @@
"""Run evaluator wrapper for string evaluators."""
from __future__ import annotations
from abc import abstractmethod
from typing import Any, Dict, List, Optional, Union
from langchainplus_sdk import EvaluationResult, RunEvaluator
from langchainplus_sdk.schemas import Example, Run
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.manager import (
AsyncCallbackManagerForChainRun,
CallbackManagerForChainRun,
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import StringEvaluator
from langchain.load.dump import dumps
from langchain.load.load import loads
from langchain.load.serializable import Serializable
from langchain.schema import RUN_KEY, messages_from_dict
from langchain.schema.messages import BaseMessage, get_buffer_string
from langchain.tools.base import Tool
def _get_messages_from_run_dict(messages: List[dict]) -> List[BaseMessage]:
if not messages:
return []
first_message = messages[0]
if "lc" in first_message:
return [loads(dumps(message)) for message in messages]
else:
return messages_from_dict(messages)
class StringRunMapper(Serializable):
"""Extract items to evaluate from the run object."""
@property
def output_keys(self) -> List[str]:
"""The keys to extract from the run."""
return ["prediction", "input"]
@abstractmethod
def map(self, run: Run) -> Dict[str, str]:
"""Maps the Run to a dictionary."""
def __call__(self, run: Run) -> Dict[str, str]:
"""Maps the Run to a dictionary."""
if not run.outputs:
raise ValueError(f"Run {run.id} has no outputs to evaluate.")
return self.map(run)
class LLMStringRunMapper(StringRunMapper):
"""Extract items to evaluate from the run object."""
def serialize_chat_messages(self, messages: List[Dict]) -> str:
"""Extract the input messages from the run."""
if isinstance(messages, list) and messages:
if isinstance(messages[0], dict):
chat_messages = _get_messages_from_run_dict(messages)
elif isinstance(messages[0], list):
# Runs from Tracer have messages as a list of lists of dicts
chat_messages = _get_messages_from_run_dict(messages[0])
else:
raise ValueError(f"Could not extract messages to evaluate {messages}")
return get_buffer_string(chat_messages)
raise ValueError(f"Could not extract messages to evaluate {messages}")
def serialize_inputs(self, inputs: Dict) -> str:
if "prompts" in inputs: # Should we even accept this?
input_ = "\n\n".join(inputs["prompts"])
elif "prompt" in inputs:
input_ = inputs["prompt"]
elif "messages" in inputs:
input_ = self.serialize_chat_messages(inputs["messages"])
else:
raise ValueError("LLM Run must have either messages or prompts as inputs.")
return input_
def serialize_outputs(self, outputs: Dict) -> str:
if not outputs.get("generations"):
raise ValueError("Cannot evaluate LLM Run without generations.")
generations: List[Dict] = outputs["generations"]
if not generations:
raise ValueError("Cannot evaluate LLM run with empty generations.")
first_generation: Dict = generations[0]
if isinstance(first_generation, list):
# Runs from Tracer have generations as a list of lists of dicts
# Whereas Runs from the API have a list of dicts
first_generation = first_generation[0]
if "message" in first_generation:
output_ = self.serialize_chat_messages([first_generation["message"]])
else:
output_ = first_generation["text"]
return output_
def map(self, run: Run) -> Dict[str, str]:
"""Maps the Run to a dictionary."""
if run.run_type != "llm":
raise ValueError("LLM RunMapper only supports LLM runs.")
elif not run.outputs:
if run.error:
raise ValueError(
f"Cannot evaluate errored LLM run {run.id}: {run.error}"
)
else:
raise ValueError(
f"Run {run.id} has no outputs. Cannot evaluate this run."
)
else:
try:
inputs = self.serialize_inputs(run.inputs)
except Exception as e:
raise ValueError(
f"Could not parse LM input from run inputs {run.inputs}"
) from e
try:
output_ = self.serialize_outputs(run.outputs)
except Exception as e:
raise ValueError(
f"Could not parse LM prediction from run outputs {run.outputs}"
) from e
return {"input": inputs, "prediction": output_}
class ChainStringRunMapper(StringRunMapper):
"""Extract items to evaluate from the run object from a chain."""
input_key: str
"""The key from the model Run's inputs to use as the eval input."""
prediction_key: str
"""The key from the model Run's outputs to use as the eval prediction."""
@classmethod
def from_chain(
cls,
model: Chain,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
) -> ChainStringRunMapper:
"""Create a RunMapper from a chain."""
error_messages = []
if input_key is None:
if len(model.input_keys) > 1:
error_messages.append(
f"Chain {model.lc_namespace} has multiple input"
" keys. Please specify 'input_key' when loading."
)
else:
input_key = model.input_keys[0]
elif input_key not in model.input_keys:
error_messages.append(
f"Chain {model.lc_namespace} does not have specified"
f" input key {input_key}."
)
if prediction_key is None:
if len(model.output_keys) > 1:
error_messages.append(
f"Chain {model.lc_namespace} has multiple"
" output keys. Please specify 'prediction_key' when loading."
)
else:
prediction_key = model.output_keys[0]
elif prediction_key not in model.output_keys:
error_messages.append(
f"Chain {model.lc_namespace} does not have specified"
f" prediction_key {prediction_key}."
)
if error_messages:
raise ValueError("\n".join(error_messages))
if input_key is None or prediction_key is None:
# This should never happen, but mypy doesn't know that.
raise ValueError(f"Chain {model.lc_namespace} has no input or output keys.")
return cls(input_key=input_key, prediction_key=prediction_key)
def map(self, run: Run) -> Dict[str, str]:
"""Maps the Run to a dictionary."""
if not run.outputs:
raise ValueError(f"Run {run.id} has no outputs to evaluate.")
if run.run_type != "chain":
raise ValueError("Chain RunMapper only supports Chain runs.")
if self.input_key not in run.inputs:
raise ValueError(f"Run {run.id} does not have input key {self.input_key}.")
elif self.prediction_key not in run.outputs:
raise ValueError(
f"Run {run.id} does not have prediction key {self.prediction_key}."
)
else:
return {
"input": run.inputs[self.input_key],
"prediction": run.outputs[self.prediction_key],
}
class ToolStringRunMapper(StringRunMapper):
"""Map an input to the tool."""
def map(self, run: Run) -> Dict[str, str]:
if not run.outputs:
raise ValueError(f"Run {run.id} has no outputs to evaluate.")
return {"input": run.inputs["input"], "prediction": run.outputs["output"]}
class StringExampleMapper(Serializable):
"""Map an example, or row in the dataset, to the inputs of an evaluation."""
reference_key: Optional[str] = None
@property
def output_keys(self) -> List[str]:
"""The keys to extract from the run."""
return ["reference"]
def serialize_chat_messages(self, messages: List[Dict]) -> str:
"""Extract the input messages from the run."""
chat_messages = _get_messages_from_run_dict(messages)
return get_buffer_string(chat_messages)
def map(self, example: Example) -> Dict[str, str]:
"""Maps the Example, or dataset row to a dictionary."""
if not example.outputs:
raise ValueError(
f"Example {example.id} has no outputs to use as a reference."
)
if self.reference_key is None:
if len(example.outputs) > 1:
raise ValueError(
f"Example {example.id} has multiple outputs, so you must"
" specify a reference_key."
)
else:
output = list(example.outputs.values())[0]
return {
"reference": self.serialize_chat_messages([output])
if isinstance(output, dict)
and output.get("type")
and output.get("data")
else output
}
elif self.reference_key not in example.outputs:
raise ValueError(
f"Example {example.id} does not have reference key"
f" {self.reference_key}."
)
return {"reference": example.outputs[self.reference_key]}
def __call__(self, example: Example) -> Dict[str, str]:
"""Maps the Run and Example to a dictionary."""
if not example.outputs:
raise ValueError(
f"Example {example.id} has no outputs to use as areference label."
)
return self.map(example)
class StringRunEvaluatorChain(Chain, RunEvaluator):
"""Evaluate Run and optional examples."""
run_mapper: StringRunMapper
"""Maps the Run to a dictionary with 'input' and 'prediction' strings."""
example_mapper: Optional[StringExampleMapper] = None
"""Maps the Example (dataset row) to a dictionary
with a 'reference' string."""
name: str
"""The name of the evaluation metric."""
string_evaluator: StringEvaluator
"""The evaluation chain."""
@property
def input_keys(self) -> List[str]:
return ["run", "example"]
@property
def output_keys(self) -> List[str]:
return ["feedback"]
def _prepare_input(self, inputs: Dict[str, Any]) -> Dict[str, str]:
run: Run = inputs["run"]
example: Optional[Example] = inputs.get("example")
evaluate_strings_inputs = self.run_mapper(run)
if example and self.example_mapper:
evaluate_strings_inputs.update(self.example_mapper(example))
elif self.string_evaluator.requires_reference:
raise ValueError(
f"Evaluator {self.name} requires an reference"
" example from the dataset,"
f" but none was provided for run {run.id}."
)
return evaluate_strings_inputs
def _prepare_output(self, output: Dict[str, Any]) -> EvaluationResult:
evaluation_result = EvaluationResult(key=self.name, **output)
if RUN_KEY in output:
# TODO: Not currently surfaced. Update
evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
return evaluation_result
def _call(
self,
inputs: Dict[str, str],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
"""Call the evaluation chain."""
evaluate_strings_inputs = self._prepare_input(inputs)
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()
chain_output = self.string_evaluator.evaluate_strings(
**evaluate_strings_inputs,
callbacks=callbacks,
)
evaluation_result = self._prepare_output(chain_output)
return {"feedback": evaluation_result}
async def _acall(
self,
inputs: Dict[str, str],
run_manager: AsyncCallbackManagerForChainRun | None = None,
) -> Dict[str, Any]:
"""Call the evaluation chain."""
evaluate_strings_inputs = self._prepare_input(inputs)
_run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
callbacks = _run_manager.get_child()
chain_output = await self.string_evaluator.aevaluate_strings(
**evaluate_strings_inputs,
callbacks=callbacks,
)
evaluation_result = self._prepare_output(chain_output)
return {"feedback": evaluation_result}
def evaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
return self({"run": run, "example": example})["feedback"]
async def aevaluate_run(
self, run: Run, example: Optional[Example] = None
) -> EvaluationResult:
"""Evaluate an example."""
result = await self.acall({"run": run, "example": example})
return result["feedback"]
@classmethod
def from_model_and_evaluator(
cls,
model: Union[Chain, BaseLanguageModel, Tool],
evaluator: StringEvaluator,
input_key: Optional[str] = None,
prediction_key: Optional[str] = None,
reference_key: Optional[str] = None,
) -> StringRunEvaluatorChain:
"""Create a StringRunEvaluatorChain from a model and evaluator."""
if isinstance(model, BaseLanguageModel):
run_mapper: StringRunMapper = LLMStringRunMapper()
elif isinstance(model, Chain):
run_mapper = ChainStringRunMapper.from_chain(
model, input_key=input_key, prediction_key=prediction_key
)
elif isinstance(model, Tool):
run_mapper = ToolStringRunMapper()
else:
raise NotImplementedError(
f"{cls.__name__}.from_model_and_evaluator({type(model)})"
" not yet implemented."
"Expected one of [BaseLanguageModel, Chain, Tool]."
)
if reference_key is not None or isinstance(model, BaseLanguageModel):
example_mapper = StringExampleMapper(reference_key=reference_key)
elif evaluator.requires_reference:
# We could potentially auto-infer if there is only one string in the
# example, but it's preferred to raise earlier.
raise ValueError(
f"Evaluator {evaluator.evaluation_name} requires a reference"
" example from the dataset. Please specify the reference key from"
" amongst the dataset outputs keys."
)
else:
example_mapper = None
return cls(
name=evaluator.evaluation_name,
run_mapper=run_mapper,
example_mapper=example_mapper,
string_evaluator=evaluator,
)

@ -91,6 +91,14 @@ class _EvalArgsMixin:
class StringEvaluator(_EvalArgsMixin, ABC):
"""Protocol for evaluating strings."""
@property
def evaluation_name(self) -> str:
raise NotImplementedError()
@property
def requires_reference(self) -> bool:
return False
@abstractmethod
def _evaluate_strings(
self,
@ -110,6 +118,10 @@ class StringEvaluator(_EvalArgsMixin, ABC):
**kwargs: additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
It is recommended that the dictionary contain the following keys:
- score: the score of the evaluation, if applicable.
- value: the string value of the evaluation, if applicable.
- reasoning: the reasoning for the evaluation, if applicable.
"""
async def _aevaluate_strings(
@ -131,6 +143,10 @@ class StringEvaluator(_EvalArgsMixin, ABC):
**kwargs: additional keyword arguments, including callbacks, tags, etc.
Returns:
dict: The evaluation results containing the score or value.
It is recommended that the dictionary contain the following keys:
- score: the score of the evaluation, if applicable.
- value: the string value of the evaluation, if applicable.
- reasoning: the reasoning for the evaluation, if applicable.
"""
raise NotImplementedError(
f"{self.__class__.__name__} hasn't implemented an "

@ -168,7 +168,7 @@ def _message_from_dict(message: dict) -> BaseMessage:
elif _type == "chat":
return ChatMessage(**message["data"])
else:
raise ValueError(f"Got unexpected type: {_type}")
raise ValueError(f"Got unexpected message type: {_type}")
def messages_from_dict(messages: List[dict]) -> List[BaseMessage]:

@ -0,0 +1,81 @@
import sys
from typing import Iterator
from uuid import uuid4
import pytest
from langchainplus_sdk import LangChainPlusClient as Client
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.client.runner_utils import run_on_dataset
from langchain.evaluation import EvaluatorType
from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model
from langchain.llms.openai import OpenAI
@pytest.fixture(
scope="module",
)
def dataset_name() -> Iterator[str]:
import pandas as pd
client = Client()
df = pd.DataFrame(
[
{"question": "5", "answer": 5.0},
{"question": "5 + 3", "answer": 8.0},
{"question": "2^3.171", "answer": 9.006708689094099},
{"question": " 2 ^3.171 ", "answer": 9.006708689094099},
]
)
uid = str(uuid4())[-8:]
_dataset_name = f"lcp integration tests - {uid}"
client.upload_dataframe(
df,
name=_dataset_name,
input_keys=["question"],
output_keys=["answer"],
description="Integration test dataset",
)
yield _dataset_name
def test_chat_model(dataset_name: str) -> None:
llm = ChatOpenAI(temperature=0)
evaluators = load_run_evaluators_for_model(
[EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
)
results = run_on_dataset(
dataset_name,
llm,
run_evaluators=evaluators,
)
print("CHAT", results, file=sys.stderr)
def test_llm(dataset_name: str) -> None:
llm = OpenAI(temperature=0)
evaluators = load_run_evaluators_for_model(
[EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
)
results = run_on_dataset(
dataset_name,
llm,
run_evaluators=evaluators,
)
print("LLM", results, file=sys.stderr)
def test_chain(dataset_name: str) -> None:
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
evaluators = load_run_evaluators_for_model(
[EvaluatorType.QA, EvaluatorType.CRITERIA], chain, reference_key="answer"
)
results = run_on_dataset(
dataset_name,
lambda: chain,
run_evaluators=evaluators,
)
print("CHAIN", results, file=sys.stderr)

@ -0,0 +1,114 @@
"""Test the loading function for evalutors."""
from unittest.mock import MagicMock
import pytest
from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
from langchain.evaluation.loading import load_evaluators
from langchain.evaluation.run_evaluators.string_run_evaluator import (
StringRunEvaluatorChain,
)
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.chains.test_base import FakeChain
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_llm(evaluator_type: str) -> None:
"""Test loading evaluators."""
fake_llm = FakeLLM(
queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
)
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
model = FakeLLM(queries={"text": "Foo output"}, sequential_responses=True)
kwargs = {}
if evaluator.requires_reference:
kwargs["reference_key"] = "generations"
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, **kwargs
)
callback = RunCollectorCallbackHandler()
model.predict("Foo input", callbacks=[callback])
run = callback.traced_runs[0]
example = MagicMock()
example.inputs = {}
example.outputs = {"generations": "Foo output"}
result = run_evaluator._prepare_input({"run": run, "example": example})
assert result["input"] == "Foo input"
assert result["prediction"] == "Foo output"
if evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Foo output"
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_chat_model(evaluator_type: str) -> None:
"""Test loading evaluators."""
fake_llm = FakeLLM(
queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
)
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
model = FakeChatModel()
kwargs = {}
if evaluator.requires_reference:
kwargs["reference_key"] = "generations"
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, **kwargs
)
callback = RunCollectorCallbackHandler()
model.predict("Foo input", callbacks=[callback])
run = callback.traced_runs[0]
example = MagicMock()
example.inputs = {}
example.outputs = {"generations": "Another fake response"}
result = run_evaluator._prepare_input({"run": run, "example": example})
assert result["input"] == "Human: Foo input"
assert result["prediction"] == "AI: fake response"
if evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Another fake response"
@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
def test_load_string_run_evaluators_with_chain(evaluator_type: str) -> None:
model = FakeChain(
the_input_keys=["an_input", "another_input"],
)
fake_llm = FakeChatModel()
evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
# No input key
with pytest.raises(ValueError, match="multiple input keys"):
StringRunEvaluatorChain.from_model_and_evaluator(model, evaluator)
with pytest.raises(ValueError, match="does not have specified"):
StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, input_key="some_input"
)
kwargs = {}
if evaluator.requires_reference:
kwargs["reference_key"] = "label_column"
run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
model, evaluator, input_key="an_input", **kwargs
)
callback = RunCollectorCallbackHandler()
model(
{"an_input": "Foo input", "another_input": "Another fake response"},
callbacks=[callback],
)
run = callback.traced_runs[0]
example = MagicMock()
example.inputs = {}
example.outputs = {"label_column": "Another fake response"}
result = run_evaluator._prepare_input({"run": run, "example": example})
assert result["input"] == "Foo input"
assert result["prediction"] == "baz"
if evaluator.requires_reference:
assert "reference" in result
assert result["reference"] == "Another fake response"

@ -3,7 +3,9 @@
import pytest
from langchain.evaluation.loading import EvaluatorType, load_evaluators
from langchain.evaluation.schema import StringEvaluator
from tests.unit_tests.llms.fake_chat_model import FakeChatModel
from tests.unit_tests.llms.fake_llm import FakeLLM
@pytest.mark.parametrize("evaluator_type", EvaluatorType)
@ -14,3 +16,16 @@ def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
# Test as string
load_evaluators([evaluator_type.value], llm=fake_llm) # type: ignore
def test_criteria_eval_chain_requires_reference() -> None:
"""Test loading evaluators."""
fake_llm = FakeLLM(
queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
)
evaluator = load_evaluators(
[EvaluatorType.CRITERIA], llm=fake_llm, requires_reference=True
)[0]
if not isinstance(evaluator, StringEvaluator):
raise ValueError("Evaluator is not a string evaluator")
assert evaluator.requires_reference

Loading…
Cancel
Save