diff --git a/langchain/evaluation/criteria/eval_chain.py b/langchain/evaluation/criteria/eval_chain.py index ea6d9119f2..ef17f51f7d 100644 --- a/langchain/evaluation/criteria/eval_chain.py +++ b/langchain/evaluation/criteria/eval_chain.py @@ -99,6 +99,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain): output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser) """The parser to use to map the output to a structured result.""" + criteria_names: List[str] = Field(default_factory=list) + """The names of the criteria being evaluated.""" class Config: """Configuration for the QAEvalChain.""" @@ -107,12 +109,24 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain): @property def requires_reference(self) -> bool: + """Whether the evaluation requires a reference text.""" return "reference" in self.prompt.input_variables @property def requires_input(self) -> bool: return True + @property + def evaluation_name(self) -> str: + """Get the name of the evaluation. + + Returns + ------- + str + The name of the evaluation. + """ + return " ".join(self.criteria_names) + @property def _skip_reference_warning(self) -> str: """Warning to show when reference is ignored.""" @@ -266,9 +280,15 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain): ) criteria_ = cls.resolve_criteria(criteria) + criteria_names = list(criteria_.keys()) criteria_str = " ".join(f"{k}: {v}" for k, v in criteria_.items()) prompt_ = prompt.partial(criteria=criteria_str) - return cls(llm=llm, prompt=prompt_, **kwargs) + return cls( + llm=llm, + prompt=prompt_, + criteria_names=criteria_names, + **kwargs, + ) def _get_eval_input( self, diff --git a/langchain/evaluation/loading.py b/langchain/evaluation/loading.py index 549b9d6cb5..e65f1b8fa6 100644 --- a/langchain/evaluation/loading.py +++ b/langchain/evaluation/loading.py @@ -67,9 +67,14 @@ def load_evaluator( Examples -------- >>> llm = ChatOpenAI(model="gpt-4", temperature=0) - >>> evaluator = load_evaluator(EvaluatorType.QA, llm=llm) + >>> evaluator = load_evaluator("qa", llm=llm) """ llm = llm or ChatOpenAI(model="gpt-4", temperature=0) + if evaluator not in _EVALUATOR_MAP: + raise ValueError( + f"Unknown evaluator type: {evaluator}. " + f"Valid types are: {list(_EVALUATOR_MAP.keys())}" + ) return _EVALUATOR_MAP[evaluator].from_llm(llm=llm, **kwargs) diff --git a/langchain/evaluation/qa/eval_chain.py b/langchain/evaluation/qa/eval_chain.py index cbcb564eae..8f658f9606 100644 --- a/langchain/evaluation/qa/eval_chain.py +++ b/langchain/evaluation/qa/eval_chain.py @@ -49,6 +49,10 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain): extra = Extra.ignore + @property + def evaluation_name(self) -> str: + return "correctness" + @property def requires_reference(self) -> bool: return True @@ -155,10 +159,12 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain): @property def requires_reference(self) -> bool: + """Whether the chain requires a reference string.""" return True @property def requires_input(self) -> bool: + """Whether the chain requires an input string.""" return True @classmethod @@ -170,6 +176,10 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain): f"but got {prompt.input_variables}" ) + @property + def evaluation_name(self) -> str: + return "Contextual Accuracy" + @classmethod def from_llm( cls, @@ -250,6 +260,10 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain): class CotQAEvalChain(ContextQAEvalChain): """LLM Chain specifically for evaluating QA
using chain of thought reasoning.""" + @property + def evaluation_name(self) -> str: + return "COT Contextual Accuracy" + @classmethod def from_llm( cls, llm: BaseLanguageModel, prompt: PromptTemplate = COT_PROMPT, **kwargs: Any diff --git a/langchain/evaluation/run_evaluators/__init__.py b/langchain/evaluation/run_evaluators/__init__.py index 5aba3e638a..c9ecb4e279 100644 --- a/langchain/evaluation/run_evaluators/__init__.py +++ b/langchain/evaluation/run_evaluators/__init__.py @@ -11,6 +11,9 @@ from langchain.evaluation.run_evaluators.implementations import ( get_qa_evaluator, get_trajectory_evaluator, ) +from langchain.evaluation.run_evaluators.string_run_evaluator import ( + StringRunEvaluatorChain, +) __all__ = [ "RunEvaluatorChain", @@ -21,4 +24,5 @@ __all__ = [ "get_trajectory_evaluator", "StringRunEvaluatorInputMapper", "ChoicesOutputParser", + "StringRunEvaluatorChain", ] diff --git a/langchain/evaluation/run_evaluators/base.py b/langchain/evaluation/run_evaluators/base.py index b640719c02..dfa90f2d80 100644 --- a/langchain/evaluation/run_evaluators/base.py +++ b/langchain/evaluation/run_evaluators/base.py @@ -21,6 +21,10 @@ class RunEvaluatorInputMapper: def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]: """Maps the Run and Optional[Example] to a dictionary""" + def __call__(self, run: Run, example: Optional[Example] = None) -> Any: + """Maps the Run and Optional[Example] to a dictionary""" + return self.map(run, example) + class RunEvaluatorOutputParser(BaseOutputParser[EvaluationResult]): """Parse the output of a run.""" diff --git a/langchain/evaluation/run_evaluators/loading.py b/langchain/evaluation/run_evaluators/loading.py new file mode 100644 index 0000000000..25e8d8fb22 --- /dev/null +++ b/langchain/evaluation/run_evaluators/loading.py @@ -0,0 +1,69 @@ +""""Loading helpers for run evaluators.""" + + +from typing import Any, List, Optional, Sequence, Union + +from langchainplus_sdk import RunEvaluator + +from langchain.base_language import BaseLanguageModel +from langchain.chains.base import Chain +from langchain.evaluation.loading import load_evaluators +from langchain.evaluation.run_evaluators.string_run_evaluator import ( + StringRunEvaluatorChain, +) +from langchain.evaluation.schema import EvaluatorType, StringEvaluator +from langchain.tools.base import Tool + + +def load_run_evaluators_for_model( + evaluators: Sequence[EvaluatorType], + model: Union[Chain, BaseLanguageModel, Tool], + *, + input_key: Optional[str] = None, + prediction_key: Optional[str] = None, + reference_key: Optional[str] = None, + eval_llm: Optional[BaseLanguageModel] = None, + **kwargs: Any, +) -> List[RunEvaluator]: + """Load evaluators specified by a list of evaluator types. + + Parameters + ---------- + evaluators : Sequence[EvaluatorType] + The list of evaluator types to load. + model : Union[Chain, BaseLanguageModel, Tool] + The model to evaluate. Used to infer how to parse the run. + input_key : Optional[str], a chain run's input key to map + to the evaluator's input + prediction_key : Optional[str], the key in the run's outputs to + represent the Chain prediction + reference_key : Optional[str], the key in the dataset example (row) + outputs to represent the reference, or ground-truth label + eval_llm : BaseLanguageModel, optional + The language model to use for evaluation, if none is provided, a default + ChatOpenAI gpt-4 model will be used. + **kwargs : Any + Additional keyword arguments to pass to all evaluators. 
+ + Returns + ------- + List[RunEvaluator] + The loaded Run evaluators. + """ + evaluators_ = load_evaluators(evaluators, llm=eval_llm, **kwargs) + run_evaluators = [] + for evaluator in evaluators_: + if isinstance(evaluator, StringEvaluator): + run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator( + model, + evaluator, + input_key=input_key, + prediction_key=prediction_key, + reference_key=reference_key, + ) + else: + raise NotImplementedError( + f"Run evaluator for {evaluator} is not implemented" + ) + run_evaluators.append(run_evaluator) + return run_evaluators diff --git a/langchain/evaluation/run_evaluators/string_run_evaluator.py b/langchain/evaluation/run_evaluators/string_run_evaluator.py new file mode 100644 index 0000000000..201c0dff1b --- /dev/null +++ b/langchain/evaluation/run_evaluators/string_run_evaluator.py @@ -0,0 +1,385 @@ +"""Run evaluator wrapper for string evaluators.""" +from __future__ import annotations + +from abc import abstractmethod +from typing import Any, Dict, List, Optional, Union + +from langchainplus_sdk import EvaluationResult, RunEvaluator +from langchainplus_sdk.schemas import Example, Run + +from langchain.base_language import BaseLanguageModel +from langchain.callbacks.manager import ( + AsyncCallbackManagerForChainRun, + CallbackManagerForChainRun, +) +from langchain.chains.base import Chain +from langchain.evaluation.schema import StringEvaluator +from langchain.load.dump import dumps +from langchain.load.load import loads +from langchain.load.serializable import Serializable +from langchain.schema import RUN_KEY, messages_from_dict +from langchain.schema.messages import BaseMessage, get_buffer_string +from langchain.tools.base import Tool + + +def _get_messages_from_run_dict(messages: List[dict]) -> List[BaseMessage]: + if not messages: + return [] + first_message = messages[0] + if "lc" in first_message: + return [loads(dumps(message)) for message in messages] + else: + return messages_from_dict(messages) + + +class StringRunMapper(Serializable): + """Extract items to evaluate from the run object.""" + + @property + def output_keys(self) -> List[str]: + """The keys to extract from the run.""" + return ["prediction", "input"] + + @abstractmethod + def map(self, run: Run) -> Dict[str, str]: + """Maps the Run to a dictionary.""" + + def __call__(self, run: Run) -> Dict[str, str]: + """Maps the Run to a dictionary.""" + if not run.outputs: + raise ValueError(f"Run {run.id} has no outputs to evaluate.") + return self.map(run) + + +class LLMStringRunMapper(StringRunMapper): + """Extract items to evaluate from the run object.""" + + def serialize_chat_messages(self, messages: List[Dict]) -> str: + """Extract the input messages from the run.""" + if isinstance(messages, list) and messages: + if isinstance(messages[0], dict): + chat_messages = _get_messages_from_run_dict(messages) + elif isinstance(messages[0], list): + # Runs from Tracer have messages as a list of lists of dicts + chat_messages = _get_messages_from_run_dict(messages[0]) + else: + raise ValueError(f"Could not extract messages to evaluate {messages}") + return get_buffer_string(chat_messages) + raise ValueError(f"Could not extract messages to evaluate {messages}") + + def serialize_inputs(self, inputs: Dict) -> str: + if "prompts" in inputs: # Should we even accept this? 
+ input_ = "\n\n".join(inputs["prompts"]) + elif "prompt" in inputs: + input_ = inputs["prompt"] + elif "messages" in inputs: + input_ = self.serialize_chat_messages(inputs["messages"]) + else: + raise ValueError("LLM Run must have either messages or prompts as inputs.") + return input_ + + def serialize_outputs(self, outputs: Dict) -> str: + if not outputs.get("generations"): + raise ValueError("Cannot evaluate LLM Run without generations.") + generations: List[Dict] = outputs["generations"] + if not generations: + raise ValueError("Cannot evaluate LLM run with empty generations.") + first_generation: Dict = generations[0] + if isinstance(first_generation, list): + # Runs from Tracer have generations as a list of lists of dicts + # Whereas Runs from the API have a list of dicts + first_generation = first_generation[0] + if "message" in first_generation: + output_ = self.serialize_chat_messages([first_generation["message"]]) + else: + output_ = first_generation["text"] + return output_ + + def map(self, run: Run) -> Dict[str, str]: + """Maps the Run to a dictionary.""" + if run.run_type != "llm": + raise ValueError("LLM RunMapper only supports LLM runs.") + elif not run.outputs: + if run.error: + raise ValueError( + f"Cannot evaluate errored LLM run {run.id}: {run.error}" + ) + else: + raise ValueError( + f"Run {run.id} has no outputs. Cannot evaluate this run." + ) + else: + try: + inputs = self.serialize_inputs(run.inputs) + except Exception as e: + raise ValueError( + f"Could not parse LM input from run inputs {run.inputs}" + ) from e + try: + output_ = self.serialize_outputs(run.outputs) + except Exception as e: + raise ValueError( + f"Could not parse LM prediction from run outputs {run.outputs}" + ) from e + return {"input": inputs, "prediction": output_} + + +class ChainStringRunMapper(StringRunMapper): + """Extract items to evaluate from the run object from a chain.""" + + input_key: str + """The key from the model Run's inputs to use as the eval input.""" + prediction_key: str + """The key from the model Run's outputs to use as the eval prediction.""" + + @classmethod + def from_chain( + cls, + model: Chain, + input_key: Optional[str] = None, + prediction_key: Optional[str] = None, + ) -> ChainStringRunMapper: + """Create a RunMapper from a chain.""" + error_messages = [] + if input_key is None: + if len(model.input_keys) > 1: + error_messages.append( + f"Chain {model.lc_namespace} has multiple input" + " keys. Please specify 'input_key' when loading." + ) + else: + input_key = model.input_keys[0] + elif input_key not in model.input_keys: + error_messages.append( + f"Chain {model.lc_namespace} does not have specified" + f" input key {input_key}." + ) + if prediction_key is None: + if len(model.output_keys) > 1: + error_messages.append( + f"Chain {model.lc_namespace} has multiple" + " output keys. Please specify 'prediction_key' when loading." + ) + else: + prediction_key = model.output_keys[0] + elif prediction_key not in model.output_keys: + error_messages.append( + f"Chain {model.lc_namespace} does not have specified" + f" prediction_key {prediction_key}." + ) + if error_messages: + raise ValueError("\n".join(error_messages)) + if input_key is None or prediction_key is None: + # This should never happen, but mypy doesn't know that. 
+ raise ValueError(f"Chain {model.lc_namespace} has no input or output keys.") + return cls(input_key=input_key, prediction_key=prediction_key) + + def map(self, run: Run) -> Dict[str, str]: + """Maps the Run to a dictionary.""" + if not run.outputs: + raise ValueError(f"Run {run.id} has no outputs to evaluate.") + if run.run_type != "chain": + raise ValueError("Chain RunMapper only supports Chain runs.") + if self.input_key not in run.inputs: + raise ValueError(f"Run {run.id} does not have input key {self.input_key}.") + elif self.prediction_key not in run.outputs: + raise ValueError( + f"Run {run.id} does not have prediction key {self.prediction_key}." + ) + else: + return { + "input": run.inputs[self.input_key], + "prediction": run.outputs[self.prediction_key], + } + + +class ToolStringRunMapper(StringRunMapper): + """Map an input to the tool.""" + + def map(self, run: Run) -> Dict[str, str]: + if not run.outputs: + raise ValueError(f"Run {run.id} has no outputs to evaluate.") + return {"input": run.inputs["input"], "prediction": run.outputs["output"]} + + +class StringExampleMapper(Serializable): + """Map an example, or row in the dataset, to the inputs of an evaluation.""" + + reference_key: Optional[str] = None + + @property + def output_keys(self) -> List[str]: + """The keys to extract from the run.""" + return ["reference"] + + def serialize_chat_messages(self, messages: List[Dict]) -> str: + """Extract the input messages from the run.""" + chat_messages = _get_messages_from_run_dict(messages) + return get_buffer_string(chat_messages) + + def map(self, example: Example) -> Dict[str, str]: + """Maps the Example, or dataset row to a dictionary.""" + if not example.outputs: + raise ValueError( + f"Example {example.id} has no outputs to use as a reference." + ) + if self.reference_key is None: + if len(example.outputs) > 1: + raise ValueError( + f"Example {example.id} has multiple outputs, so you must" + " specify a reference_key." + ) + else: + output = list(example.outputs.values())[0] + return { + "reference": self.serialize_chat_messages([output]) + if isinstance(output, dict) + and output.get("type") + and output.get("data") + else output + } + elif self.reference_key not in example.outputs: + raise ValueError( + f"Example {example.id} does not have reference key" + f" {self.reference_key}." + ) + return {"reference": example.outputs[self.reference_key]} + + def __call__(self, example: Example) -> Dict[str, str]: + """Maps the Run and Example to a dictionary.""" + if not example.outputs: + raise ValueError( + f"Example {example.id} has no outputs to use as areference label." 
+ ) + return self.map(example) + + +class StringRunEvaluatorChain(Chain, RunEvaluator): + """Evaluate Run and optional examples.""" + + run_mapper: StringRunMapper + """Maps the Run to a dictionary with 'input' and 'prediction' strings.""" + example_mapper: Optional[StringExampleMapper] = None + """Maps the Example (dataset row) to a dictionary + with a 'reference' string.""" + name: str + """The name of the evaluation metric.""" + string_evaluator: StringEvaluator + """The evaluation chain.""" + + @property + def input_keys(self) -> List[str]: + return ["run", "example"] + + @property + def output_keys(self) -> List[str]: + return ["feedback"] + + def _prepare_input(self, inputs: Dict[str, Any]) -> Dict[str, str]: + run: Run = inputs["run"] + example: Optional[Example] = inputs.get("example") + evaluate_strings_inputs = self.run_mapper(run) + if example and self.example_mapper: + evaluate_strings_inputs.update(self.example_mapper(example)) + elif self.string_evaluator.requires_reference: + raise ValueError( + f"Evaluator {self.name} requires a reference" + " example from the dataset," + f" but none was provided for run {run.id}." + ) + return evaluate_strings_inputs + + def _prepare_output(self, output: Dict[str, Any]) -> EvaluationResult: + evaluation_result = EvaluationResult(key=self.name, **output) + if RUN_KEY in output: + # TODO: Not currently surfaced. Update + evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY] + return evaluation_result + + def _call( + self, + inputs: Dict[str, str], + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> Dict[str, Any]: + """Call the evaluation chain.""" + evaluate_strings_inputs = self._prepare_input(inputs) + _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager() + callbacks = _run_manager.get_child() + chain_output = self.string_evaluator.evaluate_strings( + **evaluate_strings_inputs, + callbacks=callbacks, + ) + evaluation_result = self._prepare_output(chain_output) + return {"feedback": evaluation_result} + + async def _acall( + self, + inputs: Dict[str, str], + run_manager: AsyncCallbackManagerForChainRun | None = None, + ) -> Dict[str, Any]: + """Call the evaluation chain.""" + evaluate_strings_inputs = self._prepare_input(inputs) + _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager() + callbacks = _run_manager.get_child() + chain_output = await self.string_evaluator.aevaluate_strings( + **evaluate_strings_inputs, + callbacks=callbacks, + ) + evaluation_result = self._prepare_output(chain_output) + return {"feedback": evaluation_result} + + def evaluate_run( + self, run: Run, example: Optional[Example] = None + ) -> EvaluationResult: + """Evaluate a run.""" + return self({"run": run, "example": example})["feedback"] + + async def aevaluate_run( + self, run: Run, example: Optional[Example] = None + ) -> EvaluationResult: + """Evaluate a run.""" + result = await self.acall({"run": run, "example": example}) + return result["feedback"] + + @classmethod + def from_model_and_evaluator( + cls, + model: Union[Chain, BaseLanguageModel, Tool], + evaluator: StringEvaluator, + input_key: Optional[str] = None, + prediction_key: Optional[str] = None, + reference_key: Optional[str] = None, + ) -> StringRunEvaluatorChain: + """Create a StringRunEvaluatorChain from a model and evaluator.""" + if isinstance(model, BaseLanguageModel): + run_mapper: StringRunMapper = LLMStringRunMapper() + elif isinstance(model, Chain): + run_mapper = ChainStringRunMapper.from_chain( + model,
input_key=input_key, prediction_key=prediction_key + ) + elif isinstance(model, Tool): + run_mapper = ToolStringRunMapper() + else: + raise NotImplementedError( + f"{cls.__name__}.from_model_and_evaluator({type(model)})" + " not yet implemented." + "Expected one of [BaseLanguageModel, Chain, Tool]." + ) + if reference_key is not None or isinstance(model, BaseLanguageModel): + example_mapper = StringExampleMapper(reference_key=reference_key) + elif evaluator.requires_reference: + # We could potentially auto-infer if there is only one string in the + # example, but it's preferred to raise earlier. + raise ValueError( + f"Evaluator {evaluator.evaluation_name} requires a reference" + " example from the dataset. Please specify the reference key from" + " amongst the dataset outputs keys." + ) + else: + example_mapper = None + return cls( + name=evaluator.evaluation_name, + run_mapper=run_mapper, + example_mapper=example_mapper, + string_evaluator=evaluator, + ) diff --git a/langchain/evaluation/schema.py b/langchain/evaluation/schema.py index 1f76aeecfb..a1a1951ba2 100644 --- a/langchain/evaluation/schema.py +++ b/langchain/evaluation/schema.py @@ -91,6 +91,14 @@ class _EvalArgsMixin: class StringEvaluator(_EvalArgsMixin, ABC): """Protocol for evaluating strings.""" + @property + def evaluation_name(self) -> str: + raise NotImplementedError() + + @property + def requires_reference(self) -> bool: + return False + @abstractmethod def _evaluate_strings( self, @@ -110,6 +118,10 @@ class StringEvaluator(_EvalArgsMixin, ABC): **kwargs: additional keyword arguments, including callbacks, tags, etc. Returns: dict: The evaluation results containing the score or value. + It is recommended that the dictionary contain the following keys: + - score: the score of the evaluation, if applicable. + - value: the string value of the evaluation, if applicable. + - reasoning: the reasoning for the evaluation, if applicable. """ async def _aevaluate_strings( @@ -131,6 +143,10 @@ class StringEvaluator(_EvalArgsMixin, ABC): **kwargs: additional keyword arguments, including callbacks, tags, etc. Returns: dict: The evaluation results containing the score or value. + It is recommended that the dictionary contain the following keys: + - score: the score of the evaluation, if applicable. + - value: the string value of the evaluation, if applicable. + - reasoning: the reasoning for the evaluation, if applicable. 
""" raise NotImplementedError( f"{self.__class__.__name__} hasn't implemented an " diff --git a/langchain/schema/messages.py b/langchain/schema/messages.py index c03ae20358..a0cbf978d2 100644 --- a/langchain/schema/messages.py +++ b/langchain/schema/messages.py @@ -168,7 +168,7 @@ def _message_from_dict(message: dict) -> BaseMessage: elif _type == "chat": return ChatMessage(**message["data"]) else: - raise ValueError(f"Got unexpected type: {_type}") + raise ValueError(f"Got unexpected message type: {_type}") def messages_from_dict(messages: List[dict]) -> List[BaseMessage]: diff --git a/tests/integration_tests/client/__init__.py b/tests/integration_tests/client/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration_tests/client/test_runner_utils.py b/tests/integration_tests/client/test_runner_utils.py new file mode 100644 index 0000000000..d8f6318527 --- /dev/null +++ b/tests/integration_tests/client/test_runner_utils.py @@ -0,0 +1,81 @@ +import sys +from typing import Iterator +from uuid import uuid4 + +import pytest +from langchainplus_sdk import LangChainPlusClient as Client + +from langchain.chains.llm import LLMChain +from langchain.chat_models import ChatOpenAI +from langchain.client.runner_utils import run_on_dataset +from langchain.evaluation import EvaluatorType +from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model +from langchain.llms.openai import OpenAI + + +@pytest.fixture( + scope="module", +) +def dataset_name() -> Iterator[str]: + import pandas as pd + + client = Client() + df = pd.DataFrame( + [ + {"question": "5", "answer": 5.0}, + {"question": "5 + 3", "answer": 8.0}, + {"question": "2^3.171", "answer": 9.006708689094099}, + {"question": " 2 ^3.171 ", "answer": 9.006708689094099}, + ] + ) + + uid = str(uuid4())[-8:] + _dataset_name = f"lcp integration tests - {uid}" + client.upload_dataframe( + df, + name=_dataset_name, + input_keys=["question"], + output_keys=["answer"], + description="Integration test dataset", + ) + yield _dataset_name + + +def test_chat_model(dataset_name: str) -> None: + llm = ChatOpenAI(temperature=0) + evaluators = load_run_evaluators_for_model( + [EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer" + ) + results = run_on_dataset( + dataset_name, + llm, + run_evaluators=evaluators, + ) + print("CHAT", results, file=sys.stderr) + + +def test_llm(dataset_name: str) -> None: + llm = OpenAI(temperature=0) + evaluators = load_run_evaluators_for_model( + [EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer" + ) + results = run_on_dataset( + dataset_name, + llm, + run_evaluators=evaluators, + ) + print("LLM", results, file=sys.stderr) + + +def test_chain(dataset_name: str) -> None: + llm = ChatOpenAI(temperature=0) + chain = LLMChain.from_string(llm, "The answer to the {question} is: ") + evaluators = load_run_evaluators_for_model( + [EvaluatorType.QA, EvaluatorType.CRITERIA], chain, reference_key="answer" + ) + results = run_on_dataset( + dataset_name, + lambda: chain, + run_evaluators=evaluators, + ) + print("CHAIN", results, file=sys.stderr) diff --git a/tests/unit_tests/evaluation/run_evaluators/test_loading.py b/tests/unit_tests/evaluation/run_evaluators/test_loading.py new file mode 100644 index 0000000000..b318521599 --- /dev/null +++ b/tests/unit_tests/evaluation/run_evaluators/test_loading.py @@ -0,0 +1,114 @@ +"""Test the loading function for evalutors.""" + +from unittest.mock import MagicMock + +import pytest + +from 
langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler +from langchain.evaluation.loading import load_evaluators +from langchain.evaluation.run_evaluators.string_run_evaluator import ( + StringRunEvaluatorChain, +) +from langchain.evaluation.schema import StringEvaluator +from tests.unit_tests.chains.test_base import FakeChain +from tests.unit_tests.llms.fake_chat_model import FakeChatModel +from tests.unit_tests.llms.fake_llm import FakeLLM + + +@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"]) +def test_load_string_run_evaluators_with_llm(evaluator_type: str) -> None: + """Test loading evaluators.""" + fake_llm = FakeLLM( + queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True + ) + evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore + if not isinstance(evaluator, StringEvaluator): + raise ValueError("Evaluator is not a string evaluator") + model = FakeLLM(queries={"text": "Foo output"}, sequential_responses=True) + kwargs = {} + if evaluator.requires_reference: + kwargs["reference_key"] = "generations" + run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator( + model, evaluator, **kwargs + ) + callback = RunCollectorCallbackHandler() + model.predict("Foo input", callbacks=[callback]) + run = callback.traced_runs[0] + example = MagicMock() + example.inputs = {} + example.outputs = {"generations": "Foo output"} + result = run_evaluator._prepare_input({"run": run, "example": example}) + assert result["input"] == "Foo input" + assert result["prediction"] == "Foo output" + if evaluator.requires_reference: + assert "reference" in result + assert result["reference"] == "Foo output" + + +@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"]) +def test_load_string_run_evaluators_with_chat_model(evaluator_type: str) -> None: + """Test loading evaluators.""" + fake_llm = FakeLLM( + queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True + ) + evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore + if not isinstance(evaluator, StringEvaluator): + raise ValueError("Evaluator is not a string evaluator") + model = FakeChatModel() + kwargs = {} + if evaluator.requires_reference: + kwargs["reference_key"] = "generations" + run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator( + model, evaluator, **kwargs + ) + callback = RunCollectorCallbackHandler() + model.predict("Foo input", callbacks=[callback]) + run = callback.traced_runs[0] + example = MagicMock() + example.inputs = {} + example.outputs = {"generations": "Another fake response"} + result = run_evaluator._prepare_input({"run": run, "example": example}) + assert result["input"] == "Human: Foo input" + assert result["prediction"] == "AI: fake response" + if evaluator.requires_reference: + assert "reference" in result + assert result["reference"] == "Another fake response" + + +@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"]) +def test_load_string_run_evaluators_with_chain(evaluator_type: str) -> None: + model = FakeChain( + the_input_keys=["an_input", "another_input"], + ) + fake_llm = FakeChatModel() + evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0] # type: ignore + if not isinstance(evaluator, StringEvaluator): + raise ValueError("Evaluator is not a string evaluator") + # No input key + with pytest.raises(ValueError, match="multiple input keys"): + 
StringRunEvaluatorChain.from_model_and_evaluator(model, evaluator) + with pytest.raises(ValueError, match="does not have specified"): + StringRunEvaluatorChain.from_model_and_evaluator( + model, evaluator, input_key="some_input" + ) + kwargs = {} + if evaluator.requires_reference: + kwargs["reference_key"] = "label_column" + run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator( + model, evaluator, input_key="an_input", **kwargs + ) + callback = RunCollectorCallbackHandler() + model( + {"an_input": "Foo input", "another_input": "Another fake response"}, + callbacks=[callback], + ) + run = callback.traced_runs[0] + example = MagicMock() + example.inputs = {} + example.outputs = {"label_column": "Another fake response"} + result = run_evaluator._prepare_input({"run": run, "example": example}) + assert result["input"] == "Foo input" + assert result["prediction"] == "baz" + if evaluator.requires_reference: + assert "reference" in result + assert result["reference"] == "Another fake response" diff --git a/tests/unit_tests/evaluation/test_loading.py b/tests/unit_tests/evaluation/test_loading.py index 27c538d8b3..e707246fb0 100644 --- a/tests/unit_tests/evaluation/test_loading.py +++ b/tests/unit_tests/evaluation/test_loading.py @@ -3,7 +3,9 @@ import pytest from langchain.evaluation.loading import EvaluatorType, load_evaluators +from langchain.evaluation.schema import StringEvaluator from tests.unit_tests.llms.fake_chat_model import FakeChatModel +from tests.unit_tests.llms.fake_llm import FakeLLM @pytest.mark.parametrize("evaluator_type", EvaluatorType) @@ -14,3 +16,16 @@ def test_load_evaluators(evaluator_type: EvaluatorType) -> None: # Test as string load_evaluators([evaluator_type.value], llm=fake_llm) # type: ignore + + +def test_criteria_eval_chain_requires_reference() -> None: + """Test loading evaluators.""" + fake_llm = FakeLLM( + queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True + ) + evaluator = load_evaluators( + [EvaluatorType.CRITERIA], llm=fake_llm, requires_reference=True + )[0] + if not isinstance(evaluator, StringEvaluator): + raise ValueError("Evaluator is not a string evaluator") + assert evaluator.requires_reference
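Usage sketch (illustrative, based on the tests in this diff): the snippet below shows how the new pieces are intended to fit together. load_run_evaluators_for_model wraps string evaluators as run evaluators for a model, and StringRunEvaluatorChain.from_model_and_evaluator wraps a single string evaluator directly. The dataset name, the "answer" reference key, and the run/example objects are placeholders, and running this assumes a configured LangChainPlus client and OpenAI credentials.

    from langchain.chat_models import ChatOpenAI
    from langchain.client.runner_utils import run_on_dataset
    from langchain.evaluation import EvaluatorType
    from langchain.evaluation.loading import load_evaluators
    from langchain.evaluation.run_evaluators.loading import (
        load_run_evaluators_for_model,
    )
    from langchain.evaluation.run_evaluators.string_run_evaluator import (
        StringRunEvaluatorChain,
    )

    llm = ChatOpenAI(model="gpt-4", temperature=0)

    # Wrap QA and criteria string evaluators as run evaluators, mapping the
    # dataset's "answer" column (placeholder key) to the evaluators' reference.
    run_evaluators = load_run_evaluators_for_model(
        [EvaluatorType.QA, EvaluatorType.CRITERIA],
        llm,
        reference_key="answer",
    )
    results = run_on_dataset(
        "my dataset",  # placeholder dataset name
        llm,
        run_evaluators=run_evaluators,
    )

    # Or wrap a single string evaluator by hand; evaluate_run then takes a
    # traced Run and an optional dataset Example.
    qa_evaluator = load_evaluators([EvaluatorType.QA], llm=llm)[0]
    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
        llm, qa_evaluator, reference_key="answer"
    )
    # feedback = run_evaluator.evaluate_run(run, example)

For a Chain with multiple input or output keys, pass input_key and prediction_key explicitly, as the unit tests above do; otherwise ChainStringRunMapper.from_chain raises a ValueError.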