Load Run Evaluator (#7101)

Current problems: 1. Evaluating LLMs or Chat models isn't smooth. Even specifying 'generations' as the output inserts a redundant list into the eval template 2. Configuring input / prediction / reference keys in the `get_qa_evaluator` function is confusing. Unless you are using a chain with the default keys, you have to specify all the variables and need to reason about whether the key corresponds to the traced run's inputs, outputs or the example's inputs or outputs. Proposal: - Configure the run evaluator according to a model. Use the model type and input/output keys to assert compatibility where possible. Only need to specify a reference_key for certain evaluators (which is less confusing than specifying input keys) When does this work: - If you have your langchain model available (assumed always for run_on_dataset flow) - If you are evaluating an LLM, Chat model, or chain - If the LLM or chat models are traced by langchain (wouldn't work if you add an incompatible schema via the REST API) When would this fail: - Currently if you directly create an example from an LLM run, the outputs are generations with all the extra metadata present. A simple `example_key` and dumping all to the template could make the evaluations unreliable - Doesn't help if you're not using the low level API - If you want to instantiate the evaluator without instantiating your chain or LLM (maybe common for monitoring, for instance) -> could also load from run or run type though What's ugly: - Personally think it's better to load evaluators one by one since passing a config down is pretty confusing. - Lots of testing needs to be added - Inconsistent in that it makes a separate run and example input mapper instead of the original `RunEvaluatorInputMapper`, which maps a run and example to a single input. Example usage running this for an LLM, Chat Model, and Agent. 
``` # Test running for the string evaluators evaluator_names = ["qa", "criteria"] model = ChatOpenAI() configured_evaluators = load_run_evaluators_for_model(evaluator_names, model=model, reference_key="answer") run_on_dataset(ds_name, model, run_evaluators=configured_evaluators) ``` <details> <summary>Full code with dataset upload</summary> ``` ## Create dataset from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model from langchain.evaluation import load_dataset import pandas as pd lcds = load_dataset("llm-math") df = pd.DataFrame(lcds) from uuid import uuid4 from langsmith import Client client = Client() ds_name = "llm-math - " + str(uuid4())[0:8] ds = client.upload_dataframe(df, name=ds_name, input_keys=["question"], output_keys=["answer"]) ## Define the models we'll test over from langchain.llms import OpenAI from langchain.chat_models import ChatOpenAI from langchain.agents import initialize_agent, AgentType from langchain.tools import tool llm = OpenAI(temperature=0) chat_model = ChatOpenAI(temperature=0) @tool def sum(a: float, b: float) -> float: """Add two numbers""" return a + b def construct_agent(): return initialize_agent( llm=chat_model, tools=[sum], agent=AgentType.OPENAI_MULTI_FUNCTIONS, ) agent = construct_agent() # Test running for the string evaluators evaluator_names = ["qa", "criteria"] models = [llm, chat_model, agent] run_evaluators = [] for model in models: run_evaluators.append(load_run_evaluators_for_model(evaluator_names, model=model, reference_key="answer")) # Run on LLM, Chat Model, and Agent from langchain.client.runner_utils import run_on_dataset to_test = [llm, chat_model, construct_agent] for model, configured_evaluators in zip(to_test, run_evaluators): run_on_dataset(ds_name, model, run_evaluators=configured_evaluators, verbose=True) ``` </details> --------- Co-authored-by: Nuno Campos <nuno@boringbits.io>
2024-11-06 03:20:49 +00:00 · 2023-07-07 19:57:59 -07:00 · 2023-07-07 19:57:59 -07:00 · c5edbea34a
commit c5edbea34a
parent 1ac347b4e3
13 changed files with 730 additions and 3 deletions
--- a/langchain/evaluation/criteria/eval_chain.py
+++ b/langchain/evaluation/criteria/eval_chain.py
@ -99,6 +99,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):

    output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser)
    """The parser to use to map the output to a structured result."""
+    criteria_names: List[str] = Field(default_factory=list)
+    """The names of the criteria being evaluated."""

    class Config:
        """Configuration for the QAEvalChain."""
@ -107,12 +109,24 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):

    @property
    def requires_reference(self) -> bool:
+        """Whether the evaluation requires a reference text."""
        return "reference" in self.prompt.input_variables

    @property
    def requires_input(self) -> bool:
        return True

+    @property
+    def evaluation_name(self) -> str:
+        """Get the name of the evaluation.
+
+        Returns
+        -------
+        str
+            The name of the evaluation.
+        """
+        return " ".join(self.criteria_names)
+
    @property
    def _skip_reference_warning(self) -> str:
        """Warning to show when reference is ignored."""
@ -266,9 +280,15 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
            )

        criteria_ = cls.resolve_criteria(criteria)
+        criteria_names = list(criteria_.keys())
        criteria_str = " ".join(f"{k}: {v}" for k, v in criteria_.items())
        prompt_ = prompt.partial(criteria=criteria_str)
-        return cls(llm=llm, prompt=prompt_, **kwargs)
+        return cls(
+            llm=llm,
+            prompt=prompt_,
+            criteria_names=criteria_names,
+            **kwargs,
+        )

    def _get_eval_input(
        self,
--- a/langchain/evaluation/loading.py
+++ b/langchain/evaluation/loading.py
@ -67,9 +67,14 @@ def load_evaluator(
    Examples
    --------
    >>> llm = ChatOpenAI(model="gpt-4", temperature=0)
-    >>> evaluator = load_evaluator(EvaluatorType.QA, llm=llm)
+    >>> evaluator = load_evaluator("qa", llm=llm)
    """
    llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
+    if evaluator not in _EVALUATOR_MAP:
+        raise ValueError(
+            f"Unknown evaluator type: {evaluator}. "
+            f"Valid types are: {list(_EVALUATOR_MAP.keys())}"
+        )
    return _EVALUATOR_MAP[evaluator].from_llm(llm=llm, **kwargs)


--- a/langchain/evaluation/qa/eval_chain.py
+++ b/langchain/evaluation/qa/eval_chain.py
@ -49,6 +49,10 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):

        extra = Extra.ignore

+    @property
+    def evaluation_name(self) -> str:
+        return "correctness"
+
    @property
    def requires_reference(self) -> bool:
        return True
@ -155,10 +159,12 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):

    @property
    def requires_reference(self) -> bool:
+        """Whether the chain requires a reference string."""
        return True

    @property
    def requires_input(self) -> bool:
+        """Whether the chain requires an input string."""
        return True

    @classmethod
@ -170,6 +176,10 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
                f"but got {prompt.input_variables}"
            )

+    @property
+    def evaluation_name(self) -> str:
+        return "Contextual Accuracy"
+
    @classmethod
    def from_llm(
        cls,
@ -250,6 +260,10 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
 class CotQAEvalChain(ContextQAEvalChain):
    """LLM Chain specifically for evaluating QA using chain of thought reasoning."""

+    @property
+    def evaluation_name(self) -> str:
+        return "COT Contextual Accuracy"
+
    @classmethod
    def from_llm(
        cls, llm: BaseLanguageModel, prompt: PromptTemplate = COT_PROMPT, **kwargs: Any
--- a/langchain/evaluation/run_evaluators/init.py
+++ b/langchain/evaluation/run_evaluators/init.py
@ -11,6 +11,9 @@ from langchain.evaluation.run_evaluators.implementations import (
    get_qa_evaluator,
    get_trajectory_evaluator,
 )
+from langchain.evaluation.run_evaluators.string_run_evaluator import (
+    StringRunEvaluatorChain,
+)

 __all__ = [
    "RunEvaluatorChain",
@ -21,4 +24,5 @@ __all__ = [
    "get_trajectory_evaluator",
    "StringRunEvaluatorInputMapper",
    "ChoicesOutputParser",
+    "StringRunEvaluatorChain",
 ]
--- a/langchain/evaluation/run_evaluators/base.py
+++ b/langchain/evaluation/run_evaluators/base.py
@ -21,6 +21,10 @@ class RunEvaluatorInputMapper:
    def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
        """Maps the Run and Optional[Example] to a dictionary"""

+    def __call__(self, run: Run, example: Optional[Example] = None) -> Any:
+        """Maps the Run and Optional[Example] to a dictionary"""
+        return self.map(run, example)
+

 class RunEvaluatorOutputParser(BaseOutputParser[EvaluationResult]):
    """Parse the output of a run."""
--- a/langchain/evaluation/run_evaluators/loading.py
+++ b/langchain/evaluation/run_evaluators/loading.py
@ -0,0 +1,69 @
+"""Loading helpers for run evaluators."""
+
+
+from typing import Any, List, Optional, Sequence, Union
+
+from langchainplus_sdk import RunEvaluator
+
+from langchain.base_language import BaseLanguageModel
+from langchain.chains.base import Chain
+from langchain.evaluation.loading import load_evaluators
+from langchain.evaluation.run_evaluators.string_run_evaluator import (
+    StringRunEvaluatorChain,
+)
+from langchain.evaluation.schema import EvaluatorType, StringEvaluator
+from langchain.tools.base import Tool
+
+
+def load_run_evaluators_for_model(
+    evaluators: Sequence[EvaluatorType],
+    model: Union[Chain, BaseLanguageModel, Tool],
+    *,
+    input_key: Optional[str] = None,
+    prediction_key: Optional[str] = None,
+    reference_key: Optional[str] = None,
+    eval_llm: Optional[BaseLanguageModel] = None,
+    **kwargs: Any,
+) -> List[RunEvaluator]:
+    """Load run evaluators for a list of evaluator types, configured for a model.
+
+    Parameters
+    ----------
+    evaluators : Sequence[EvaluatorType]
+        The list of evaluator types to load.
+    model : Union[Chain, BaseLanguageModel, Tool]
+        The model to evaluate. Used to infer how to parse the run.
+    input_key : Optional[str]
+        A chain run's input key to map to the evaluator's input.
+    prediction_key : Optional[str]
+        The key in the run's outputs to represent the Chain prediction.
+    reference_key : Optional[str]
+        The dataset example (row) output key holding the ground-truth label.
+    eval_llm : BaseLanguageModel, optional
+        The language model to use for evaluation, if none is provided, a default
+        ChatOpenAI gpt-4 model will be used.
+    **kwargs : Any
+        Additional keyword arguments to pass to all evaluators.
+
+    Returns
+    -------
+    List[RunEvaluator]
+        The loaded Run evaluators.
+    """
+    evaluators_ = load_evaluators(evaluators, llm=eval_llm, **kwargs)
+    run_evaluators = []
+    for evaluator in evaluators_:
+        if isinstance(evaluator, StringEvaluator):
+            run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
+                model,
+                evaluator,
+                input_key=input_key,
+                prediction_key=prediction_key,
+                reference_key=reference_key,
+            )
+        else:
+            raise NotImplementedError(
+                f"Run evaluator for {evaluator} is not implemented"
+            )
+        run_evaluators.append(run_evaluator)
+    return run_evaluators
--- a/langchain/evaluation/run_evaluators/string_run_evaluator.py
+++ b/langchain/evaluation/run_evaluators/string_run_evaluator.py
@ -0,0 +1,385 @@
+"""Run evaluator wrapper for string evaluators."""
+from __future__ import annotations
+
+from abc import abstractmethod
+from typing import Any, Dict, List, Optional, Union
+
+from langchainplus_sdk import EvaluationResult, RunEvaluator
+from langchainplus_sdk.schemas import Example, Run
+
+from langchain.base_language import BaseLanguageModel
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForChainRun,
+    CallbackManagerForChainRun,
+)
+from langchain.chains.base import Chain
+from langchain.evaluation.schema import StringEvaluator
+from langchain.load.dump import dumps
+from langchain.load.load import loads
+from langchain.load.serializable import Serializable
+from langchain.schema import RUN_KEY, messages_from_dict
+from langchain.schema.messages import BaseMessage, get_buffer_string
+from langchain.tools.base import Tool
+
+
+def _get_messages_from_run_dict(messages: List[dict]) -> List[BaseMessage]:
+    if not messages:
+        return []
+    first_message = messages[0]
+    if "lc" in first_message:
+        return [loads(dumps(message)) for message in messages]
+    else:
+        return messages_from_dict(messages)
+
+
+class StringRunMapper(Serializable):
+    """Extract items to evaluate from the run object."""
+
+    @property
+    def output_keys(self) -> List[str]:
+        """The keys to extract from the run."""
+        return ["prediction", "input"]
+
+    @abstractmethod
+    def map(self, run: Run) -> Dict[str, str]:
+        """Maps the Run to a dictionary."""
+
+    def __call__(self, run: Run) -> Dict[str, str]:
+        """Maps the Run to a dictionary."""
+        if not run.outputs:
+            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
+        return self.map(run)
+
+
+class LLMStringRunMapper(StringRunMapper):
+    """Extract the 'input' and 'prediction' strings from an LLM run."""
+
+    def serialize_chat_messages(self, messages: List[Dict]) -> str:
+        """Convert message dicts (or a nested list of them) to a buffer string."""
+        if isinstance(messages, list) and messages:
+            if isinstance(messages[0], dict):
+                chat_messages = _get_messages_from_run_dict(messages)
+            elif isinstance(messages[0], list):
+                # Runs from Tracer have messages as a list of lists of dicts
+                chat_messages = _get_messages_from_run_dict(messages[0])
+            else:
+                raise ValueError(f"Could not extract messages to evaluate {messages}")
+            return get_buffer_string(chat_messages)
+        raise ValueError(f"Could not extract messages to evaluate {messages}")
+
+    def serialize_inputs(self, inputs: Dict) -> str:
+        if "prompts" in inputs:  # Should we even accept this?
+            input_ = "\n\n".join(inputs["prompts"])
+        elif "prompt" in inputs:
+            input_ = inputs["prompt"]
+        elif "messages" in inputs:
+            input_ = self.serialize_chat_messages(inputs["messages"])
+        else:
+            raise ValueError("LLM Run must have either messages or prompts as inputs.")
+        return input_
+
+    def serialize_outputs(self, outputs: Dict) -> str:
+        if not outputs.get("generations"):
+            raise ValueError("Cannot evaluate LLM Run without generations.")
+        generations: List[Dict] = outputs["generations"]
+        if not generations:
+            raise ValueError("Cannot evaluate LLM run with empty generations.")
+        first_generation: Dict = generations[0]
+        if isinstance(first_generation, list):
+            # Runs from Tracer have generations as a list of lists of dicts
+            # Whereas Runs from the API have a list of dicts
+            first_generation = first_generation[0]
+        if "message" in first_generation:
+            output_ = self.serialize_chat_messages([first_generation["message"]])
+        else:
+            output_ = first_generation["text"]
+        return output_
+
+    def map(self, run: Run) -> Dict[str, str]:
+        """Maps the Run to a dictionary."""
+        if run.run_type != "llm":
+            raise ValueError("LLM RunMapper only supports LLM runs.")
+        elif not run.outputs:
+            if run.error:
+                raise ValueError(
+                    f"Cannot evaluate errored LLM run {run.id}: {run.error}"
+                )
+            else:
+                raise ValueError(
+                    f"Run {run.id} has no outputs. Cannot evaluate this run."
+                )
+        else:
+            try:
+                inputs = self.serialize_inputs(run.inputs)
+            except Exception as e:
+                raise ValueError(
+                    f"Could not parse LM input from run inputs {run.inputs}"
+                ) from e
+            try:
+                output_ = self.serialize_outputs(run.outputs)
+            except Exception as e:
+                raise ValueError(
+                    f"Could not parse LM prediction from run outputs {run.outputs}"
+                ) from e
+            return {"input": inputs, "prediction": output_}
+
+
+class ChainStringRunMapper(StringRunMapper):
+    """Extract items to evaluate from the run object from a chain."""
+
+    input_key: str
+    """The key from the model Run's inputs to use as the eval input."""
+    prediction_key: str
+    """The key from the model Run's outputs to use as the eval prediction."""
+
+    @classmethod
+    def from_chain(
+        cls,
+        model: Chain,
+        input_key: Optional[str] = None,
+        prediction_key: Optional[str] = None,
+    ) -> ChainStringRunMapper:
+        """Create a RunMapper from a chain, inferring unambiguous keys."""
+        error_messages = []
+        if input_key is None:
+            if len(model.input_keys) > 1:
+                error_messages.append(
+                    f"Chain {model.lc_namespace} has multiple input"
+                    " keys. Please specify 'input_key' when loading."
+                )
+            elif model.input_keys:
+                input_key = model.input_keys[0]
+        elif input_key not in model.input_keys:
+            error_messages.append(
+                f"Chain {model.lc_namespace} does not have specified"
+                f" input key {input_key}."
+            )
+        if prediction_key is None:
+            if len(model.output_keys) > 1:
+                error_messages.append(
+                    f"Chain {model.lc_namespace} has multiple"
+                    " output keys. Please specify 'prediction_key' when loading."
+                )
+            elif model.output_keys:
+                prediction_key = model.output_keys[0]
+        elif prediction_key not in model.output_keys:
+            error_messages.append(
+                f"Chain {model.lc_namespace} does not have specified"
+                f" prediction_key {prediction_key}."
+            )
+        if error_messages:
+            raise ValueError("\n".join(error_messages))
+        if input_key is None or prediction_key is None:
+            # Reached when the chain declares no input keys or no output keys.
+            raise ValueError(f"Chain {model.lc_namespace} has no input or output keys.")
+        return cls(input_key=input_key, prediction_key=prediction_key)
+
+    def map(self, run: Run) -> Dict[str, str]:
+        """Maps the Run to a dictionary."""
+        if not run.outputs:
+            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
+        if run.run_type != "chain":
+            raise ValueError("Chain RunMapper only supports Chain runs.")
+        if self.input_key not in run.inputs:
+            raise ValueError(f"Run {run.id} does not have input key {self.input_key}.")
+        elif self.prediction_key not in run.outputs:
+            raise ValueError(
+                f"Run {run.id} does not have prediction key {self.prediction_key}."
+            )
+        else:
+            return {
+                "input": run.inputs[self.input_key],
+                "prediction": run.outputs[self.prediction_key],
+            }
+
+
+class ToolStringRunMapper(StringRunMapper):
+    """Map a tool run's 'input' and 'output' to the eval input and prediction."""
+
+    def map(self, run: Run) -> Dict[str, str]:
+        if not run.outputs:
+            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
+        return {"input": run.inputs["input"], "prediction": run.outputs["output"]}
+
+
+class StringExampleMapper(Serializable):
+    """Map an example, or row in the dataset, to the inputs of an evaluation."""
+
+    reference_key: Optional[str] = None
+
+    @property
+    def output_keys(self) -> List[str]:
+        """The keys produced from the example."""
+        return ["reference"]
+
+    def serialize_chat_messages(self, messages: List[Dict]) -> str:
+        """Convert the message dicts from the example to a buffer string."""
+        chat_messages = _get_messages_from_run_dict(messages)
+        return get_buffer_string(chat_messages)
+
+    def map(self, example: Example) -> Dict[str, str]:
+        """Maps the Example, or dataset row to a dictionary."""
+        if not example.outputs:
+            raise ValueError(
+                f"Example {example.id} has no outputs to use as a reference."
+            )
+        if self.reference_key is None:
+            if len(example.outputs) > 1:
+                raise ValueError(
+                    f"Example {example.id} has multiple outputs, so you must"
+                    " specify a reference_key."
+                )
+            else:
+                output = list(example.outputs.values())[0]
+                return {
+                    "reference": self.serialize_chat_messages([output])
+                    if isinstance(output, dict)
+                    and output.get("type")
+                    and output.get("data")
+                    else output
+                }
+        elif self.reference_key not in example.outputs:
+            raise ValueError(
+                f"Example {example.id} does not have reference key"
+                f" {self.reference_key}."
+            )
+        return {"reference": example.outputs[self.reference_key]}
+
+    def __call__(self, example: Example) -> Dict[str, str]:
+        """Maps the Example to a dictionary."""
+        if not example.outputs:
+            raise ValueError(
+                f"Example {example.id} has no outputs to use as a reference label."
+            )
+        return self.map(example)
+
+
+class StringRunEvaluatorChain(Chain, RunEvaluator):
+    """Evaluate Run and optional examples."""
+
+    run_mapper: StringRunMapper
+    """Maps the Run to a dictionary with 'input' and 'prediction' strings."""
+    example_mapper: Optional[StringExampleMapper] = None
+    """Maps the Example (dataset row) to a dictionary
+    with a 'reference' string."""
+    name: str
+    """The name of the evaluation metric."""
+    string_evaluator: StringEvaluator
+    """The evaluation chain."""
+
+    @property
+    def input_keys(self) -> List[str]:
+        return ["run", "example"]
+
+    @property
+    def output_keys(self) -> List[str]:
+        return ["feedback"]
+
+    def _prepare_input(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+        run: Run = inputs["run"]
+        example: Optional[Example] = inputs.get("example")
+        evaluate_strings_inputs = self.run_mapper(run)
+        if example and self.example_mapper:
+            evaluate_strings_inputs.update(self.example_mapper(example))
+        elif self.string_evaluator.requires_reference:
+            raise ValueError(
+                f"Evaluator {self.name} requires a reference"
+                " example from the dataset,"
+                f" but none was provided for run {run.id}."
+            )
+        return evaluate_strings_inputs
+
+    def _prepare_output(self, output: Dict[str, Any]) -> EvaluationResult:
+        evaluation_result = EvaluationResult(key=self.name, **output)
+        if RUN_KEY in output:
+            # TODO: Not currently surfaced. Update
+            evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
+        return evaluation_result
+
+    def _call(
+        self,
+        inputs: Dict[str, str],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        """Call the evaluation chain."""
+        evaluate_strings_inputs = self._prepare_input(inputs)
+        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
+        callbacks = _run_manager.get_child()
+        chain_output = self.string_evaluator.evaluate_strings(
+            **evaluate_strings_inputs,
+            callbacks=callbacks,
+        )
+        evaluation_result = self._prepare_output(chain_output)
+        return {"feedback": evaluation_result}
+
+    async def _acall(
+        self,
+        inputs: Dict[str, str],
+        run_manager: AsyncCallbackManagerForChainRun | None = None,
+    ) -> Dict[str, Any]:
+        """Call the evaluation chain."""
+        evaluate_strings_inputs = self._prepare_input(inputs)
+        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
+        callbacks = _run_manager.get_child()
+        chain_output = await self.string_evaluator.aevaluate_strings(
+            **evaluate_strings_inputs,
+            callbacks=callbacks,
+        )
+        evaluation_result = self._prepare_output(chain_output)
+        return {"feedback": evaluation_result}
+
+    def evaluate_run(
+        self, run: Run, example: Optional[Example] = None
+    ) -> EvaluationResult:
+        """Evaluate a run, optionally against a dataset example."""
+        return self({"run": run, "example": example})["feedback"]
+
+    async def aevaluate_run(
+        self, run: Run, example: Optional[Example] = None
+    ) -> EvaluationResult:
+        """Asynchronously evaluate a run, optionally against a dataset example."""
+        result = await self.acall({"run": run, "example": example})
+        return result["feedback"]
+
+    @classmethod
+    def from_model_and_evaluator(
+        cls,
+        model: Union[Chain, BaseLanguageModel, Tool],
+        evaluator: StringEvaluator,
+        input_key: Optional[str] = None,
+        prediction_key: Optional[str] = None,
+        reference_key: Optional[str] = None,
+    ) -> StringRunEvaluatorChain:
+        """Create a StringRunEvaluatorChain from a model and evaluator."""
+        if isinstance(model, BaseLanguageModel):
+            run_mapper: StringRunMapper = LLMStringRunMapper()
+        elif isinstance(model, Chain):
+            run_mapper = ChainStringRunMapper.from_chain(
+                model, input_key=input_key, prediction_key=prediction_key
+            )
+        elif isinstance(model, Tool):
+            run_mapper = ToolStringRunMapper()
+        else:
+            raise NotImplementedError(
+                f"{cls.__name__}.from_model_and_evaluator({type(model)})"
+                " not yet implemented. "
+                "Expected one of [BaseLanguageModel, Chain, Tool]."
+            )
+        if reference_key is not None or isinstance(model, BaseLanguageModel):
+            example_mapper = StringExampleMapper(reference_key=reference_key)
+        elif evaluator.requires_reference:
+            # We could potentially auto-infer if there is only one string in the
+            # example, but it's preferred to raise earlier.
+            raise ValueError(
+                f"Evaluator {evaluator.evaluation_name} requires a reference"
+                " example from the dataset. Please specify the reference key from"
+                " amongst the dataset outputs keys."
+            )
+        else:
+            example_mapper = None
+        return cls(
+            name=evaluator.evaluation_name,
+            run_mapper=run_mapper,
+            example_mapper=example_mapper,
+            string_evaluator=evaluator,
+        )
--- a/langchain/evaluation/schema.py
+++ b/langchain/evaluation/schema.py
@ -91,6 +91,14 @@ class _EvalArgsMixin:
 class StringEvaluator(_EvalArgsMixin, ABC):
    """Protocol for evaluating strings."""

+    @property
+    def evaluation_name(self) -> str:
+        raise NotImplementedError()
+
+    @property
+    def requires_reference(self) -> bool:
+        return False
+
    @abstractmethod
    def _evaluate_strings(
        self,
@ -110,6 +118,10 @@ class StringEvaluator(_EvalArgsMixin, ABC):
            **kwargs: additional keyword arguments, including callbacks, tags, etc.
        Returns:
            dict: The evaluation results containing the score or value.
+                It is recommended that the dictionary contain the following keys:
+                    - score: the score of the evaluation, if applicable.
+                    - value: the string value of the evaluation, if applicable.
+                    - reasoning: the reasoning for the evaluation, if applicable.
        """

    async def _aevaluate_strings(
@ -131,6 +143,10 @@ class StringEvaluator(_EvalArgsMixin, ABC):
            **kwargs: additional keyword arguments, including callbacks, tags, etc.
        Returns:
            dict: The evaluation results containing the score or value.
+                It is recommended that the dictionary contain the following keys:
+                    - score: the score of the evaluation, if applicable.
+                    - value: the string value of the evaluation, if applicable.
+                    - reasoning: the reasoning for the evaluation, if applicable.
        """
        raise NotImplementedError(
            f"{self.__class__.__name__} hasn't implemented an "
--- a/langchain/schema/messages.py
+++ b/langchain/schema/messages.py
@ -168,7 +168,7 @@ def _message_from_dict(message: dict) -> BaseMessage:
    elif _type == "chat":
        return ChatMessage(**message["data"])
    else:
-        raise ValueError(f"Got unexpected type: {_type}")
+        raise ValueError(f"Got unexpected message type: {_type}")


 def messages_from_dict(messages: List[dict]) -> List[BaseMessage]:
--- a/tests/integration_tests/client/init.py
+++ b/tests/integration_tests/client/init.py
--- a/tests/integration_tests/client/test_runner_utils.py
+++ b/tests/integration_tests/client/test_runner_utils.py
@ -0,0 +1,81 @@
+import sys
+from typing import Iterator
+from uuid import uuid4
+
+import pytest
+from langchainplus_sdk import LangChainPlusClient as Client
+
+from langchain.chains.llm import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.client.runner_utils import run_on_dataset
+from langchain.evaluation import EvaluatorType
+from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model
+from langchain.llms.openai import OpenAI
+
+
+@pytest.fixture(
+    scope="module",
+)
+def dataset_name() -> Iterator[str]:
+    import pandas as pd
+
+    client = Client()
+    df = pd.DataFrame(
+        [
+            {"question": "5", "answer": 5.0},
+            {"question": "5 + 3", "answer": 8.0},
+            {"question": "2^3.171", "answer": 9.006708689094099},
+            {"question": "  2 ^3.171 ", "answer": 9.006708689094099},
+        ]
+    )
+
+    uid = str(uuid4())[-8:]
+    _dataset_name = f"lcp integration tests - {uid}"
+    client.upload_dataframe(
+        df,
+        name=_dataset_name,
+        input_keys=["question"],
+        output_keys=["answer"],
+        description="Integration test dataset",
+    )
+    yield _dataset_name
+
+
+def test_chat_model(dataset_name: str) -> None:
+    llm = ChatOpenAI(temperature=0)
+    evaluators = load_run_evaluators_for_model(
+        [EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
+    )
+    results = run_on_dataset(
+        dataset_name,
+        llm,
+        run_evaluators=evaluators,
+    )
+    print("CHAT", results, file=sys.stderr)
+
+
+def test_llm(dataset_name: str) -> None:
+    llm = OpenAI(temperature=0)
+    evaluators = load_run_evaluators_for_model(
+        [EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
+    )
+    results = run_on_dataset(
+        dataset_name,
+        llm,
+        run_evaluators=evaluators,
+    )
+    print("LLM", results, file=sys.stderr)
+
+
+def test_chain(dataset_name: str) -> None:
+    llm = ChatOpenAI(temperature=0)
+    chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
+    evaluators = load_run_evaluators_for_model(
+        [EvaluatorType.QA, EvaluatorType.CRITERIA], chain, reference_key="answer"
+    )
+    results = run_on_dataset(
+        dataset_name,
+        lambda: chain,
+        run_evaluators=evaluators,
+    )
+    print("CHAIN", results, file=sys.stderr)
--- a/tests/unit_tests/evaluation/run_evaluators/test_loading.py
+++ b/tests/unit_tests/evaluation/run_evaluators/test_loading.py
@ -0,0 +1,114 @@
+"""Test the loading function for evaluators."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
+from langchain.evaluation.loading import load_evaluators
+from langchain.evaluation.run_evaluators.string_run_evaluator import (
+    StringRunEvaluatorChain,
+)
+from langchain.evaluation.schema import StringEvaluator
+from tests.unit_tests.chains.test_base import FakeChain
+from tests.unit_tests.llms.fake_chat_model import FakeChatModel
+from tests.unit_tests.llms.fake_llm import FakeLLM
+
+
+@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
+def test_load_string_run_evaluators_with_llm(evaluator_type: str) -> None:
+    """Check the inputs a string run evaluator prepares from a traced LLM run."""
+    grader_llm = FakeLLM(
+        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
+    )
+    loaded = load_evaluators([evaluator_type], llm=grader_llm)  # type: ignore
+    evaluator = loaded[0]
+    if not isinstance(evaluator, StringEvaluator):
+        raise ValueError("Evaluator is not a string evaluator")
+
+    model = FakeLLM(queries={"text": "Foo output"}, sequential_responses=True)
+    # A reference key is only passed for evaluators that ask for a reference.
+    extra = {"reference_key": "generations"} if evaluator.requires_reference else {}
+    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
+        model, evaluator, **extra
+    )
+
+    # Collect the run traced while the model answers a single prompt.
+    collector = RunCollectorCallbackHandler()
+    model.predict("Foo input", callbacks=[collector])
+    run = collector.traced_runs[0]
+    example = MagicMock(inputs={}, outputs={"generations": "Foo output"})
+
+    prepared = run_evaluator._prepare_input({"run": run, "example": example})
+    assert prepared["input"] == "Foo input"
+    assert prepared["prediction"] == "Foo output"
+    if evaluator.requires_reference:
+        assert "reference" in prepared
+        assert prepared["reference"] == "Foo output"
+
+
+@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
+def test_load_string_run_evaluators_with_chat_model(evaluator_type: str) -> None:
+    """Check the inputs a string run evaluator prepares from a chat model run."""
+    grader_llm = FakeLLM(
+        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
+    )
+    loaded = load_evaluators([evaluator_type], llm=grader_llm)  # type: ignore
+    evaluator = loaded[0]
+    if not isinstance(evaluator, StringEvaluator):
+        raise ValueError("Evaluator is not a string evaluator")
+
+    model = FakeChatModel()
+    # A reference key is only passed for evaluators that ask for a reference.
+    extra = {"reference_key": "generations"} if evaluator.requires_reference else {}
+    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
+        model, evaluator, **extra
+    )
+
+    # Collect the run traced while the model answers a single prompt.
+    collector = RunCollectorCallbackHandler()
+    model.predict("Foo input", callbacks=[collector])
+    run = collector.traced_runs[0]
+    example = MagicMock(inputs={}, outputs={"generations": "Another fake response"})
+
+    prepared = run_evaluator._prepare_input({"run": run, "example": example})
+    assert prepared["input"] == "Human: Foo input"
+    assert prepared["prediction"] == "AI: fake response"
+    if evaluator.requires_reference:
+        assert "reference" in prepared
+        assert prepared["reference"] == "Another fake response"
+
+
+@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
+def test_load_string_run_evaluators_with_chain(evaluator_type: str) -> None:
+    """Check input-key validation and prepared inputs for a two-input chain."""
+    model = FakeChain(
+        the_input_keys=["an_input", "another_input"],
+    )
+    grader_llm = FakeChatModel()
+    loaded = load_evaluators([evaluator_type], llm=grader_llm)  # type: ignore
+    evaluator = loaded[0]
+    if not isinstance(evaluator, StringEvaluator):
+        raise ValueError("Evaluator is not a string evaluator")
+
+    build = StringRunEvaluatorChain.from_model_and_evaluator
+    # No input key
+    with pytest.raises(ValueError, match="multiple input keys"):
+        build(model, evaluator)
+    # An input key that is not one of the chain's declared input keys
+    with pytest.raises(ValueError, match="does not have specified"):
+        build(model, evaluator, input_key="some_input")
+
+    # A reference key is only passed for evaluators that ask for a reference.
+    extra = {"reference_key": "label_column"} if evaluator.requires_reference else {}
+    run_evaluator = build(model, evaluator, input_key="an_input", **extra)
+
+    # Collect the run traced while the chain is invoked once.
+    collector = RunCollectorCallbackHandler()
+    chain_inputs = {"an_input": "Foo input", "another_input": "Another fake response"}
+    model(chain_inputs, callbacks=[collector])
+    run = collector.traced_runs[0]
+    example = MagicMock(inputs={}, outputs={"label_column": "Another fake response"})
+
+    prepared = run_evaluator._prepare_input({"run": run, "example": example})
+    assert prepared["input"] == "Foo input"
+    assert prepared["prediction"] == "baz"
+    if evaluator.requires_reference:
+        assert "reference" in prepared
+        assert prepared["reference"] == "Another fake response"
--- a/tests/unit_tests/evaluation/test_loading.py
+++ b/tests/unit_tests/evaluation/test_loading.py
@ -3,7 +3,9 @@
 import pytest

 from langchain.evaluation.loading import EvaluatorType, load_evaluators
+from langchain.evaluation.schema import StringEvaluator
 from tests.unit_tests.llms.fake_chat_model import FakeChatModel
+from tests.unit_tests.llms.fake_llm import FakeLLM


@pytest.mark.parametrize("evaluator_type", EvaluatorType)
@ -14,3 +16,16 @@ def test_load_evaluators(evaluator_type: EvaluatorType) -> None:

    # Test as string
    load_evaluators([evaluator_type.value], llm=fake_llm)  # type: ignore
+
+
+def test_criteria_eval_chain_requires_reference() -> None:
+    """Check a criteria evaluator loaded with requires_reference=True reports it."""
+    grader_llm = FakeLLM(
+        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
+    )
+    loaded = load_evaluators(
+        [EvaluatorType.CRITERIA], llm=grader_llm, requires_reference=True
+    )
+    evaluator = loaded[0]
+    if not isinstance(evaluator, StringEvaluator):
+        raise ValueError("Evaluator is not a string evaluator")
+    assert evaluator.requires_reference