From c5edbea34aa3e3d3ee47e00fda0ef63802dd4d7c Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Fri, 7 Jul 2023 19:57:59 -0700
Subject: [PATCH] Load Run Evaluator (#7101)

Current problems:
1. Evaluating LLMs or Chat models isn't smooth. Even specifying
'generations' as the output inserts a redundant list into the eval
template
2. Configuring input / prediction / reference keys in the
`get_qa_evaluator` function is confusing. Unless you are using a chain
with the default keys, you have to specify all the variables and need to
reason about whether the key corresponds to the traced run's inputs,
outputs or the examples inputs or outputs.


Proposal:
- Configure the run evaluator according to a model. Use the model type
and input/output keys to assert compatibility where possible. Only need
to specify a reference_key for certain evaluators (which is less
confusing than specifying input keys)


When does this work:
- If you have your langchain model available (assumed always for
run_on_dataset flow)
- If you are evaluating an LLM, Chat model, or chain
- If the LLM or chat models are traced by langchain (wouldn't work if
you add an incompatible schema via the REST API)

When would this fail:
- Currently, if you directly create an example from an LLM run, the
outputs are generations with all the extra metadata present. Selecting
them with a simple `example_key` and dumping everything into the template
could make the evaluations unreliable
- Doesn't help if you're not using the low level API
- If you want to instantiate the evaluator without instantiating your
chain or LLM (maybe common for monitoring, for instance) -> could also
load from run or run type though

What's ugly:
- Personally think it's better to load evaluators one by one since
passing a config down is pretty confusing.
- Lots of testing needs to be added
- Inconsistent in that it makes a separate run and example input mapper
instead of the original `RunEvaluatorInputMapper`, which maps a run and
example to a single input.

Example usage, running the evaluators for an LLM, Chat Model, and Agent:

```
# Test running for the string evaluators
evaluator_names = ["qa", "criteria"]

model = ChatOpenAI()
configured_evaluators = load_run_evaluators_for_model(evaluator_names, model=model, reference_key="answer")
run_on_dataset(ds_name, model, run_evaluators=configured_evaluators)
```


<details>
  <summary>Full code with dataset upload</summary>

```
## Create dataset
from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model
from langchain.evaluation import load_dataset
import pandas as pd

lcds = load_dataset("llm-math")
df = pd.DataFrame(lcds)

from uuid import uuid4
from langsmith import Client
client = Client()
ds_name = "llm-math - " + str(uuid4())[0:8]
ds = client.upload_dataframe(df, name=ds_name, input_keys=["question"], output_keys=["answer"])



## Define the models we'll test over
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, AgentType

from langchain.tools import tool

llm = OpenAI(temperature=0)
chat_model = ChatOpenAI(temperature=0)

@tool
def sum(a: float, b: float) -> float:
    """Add two numbers"""
    return a + b

def construct_agent():
    return initialize_agent(
        llm=chat_model,
        tools=[sum],
        agent=AgentType.OPENAI_MULTI_FUNCTIONS,
    )

agent = construct_agent()

# Test running for the string evaluators
evaluator_names = ["qa", "criteria"]

models = [llm, chat_model, agent]
run_evaluators = []
for model in models:
    run_evaluators.append(load_run_evaluators_for_model(evaluator_names, model=model, reference_key="answer"))


# Run on LLM, Chat Model, and Agent
from langchain.client.runner_utils import run_on_dataset

to_test = [llm, chat_model, construct_agent]

for model, configured_evaluators in zip(to_test, run_evaluators):
    run_on_dataset(ds_name, model, run_evaluators=configured_evaluators, verbose=True)
```
</details>

---------

Co-authored-by: Nuno Campos <nuno@boringbits.io>
---
 langchain/evaluation/criteria/eval_chain.py   |  22 +-
 langchain/evaluation/loading.py               |   7 +-
 langchain/evaluation/qa/eval_chain.py         |  14 +
 .../evaluation/run_evaluators/__init__.py     |   4 +
 langchain/evaluation/run_evaluators/base.py   |   4 +
 .../evaluation/run_evaluators/loading.py      |  69 ++++
 .../run_evaluators/string_run_evaluator.py    | 385 ++++++++++++++++++
 langchain/evaluation/schema.py                |  16 +
 langchain/schema/messages.py                  |   2 +-
 tests/integration_tests/client/__init__.py    |   0
 .../client/test_runner_utils.py               |  81 ++++
 .../evaluation/run_evaluators/test_loading.py | 114 ++++++
 tests/unit_tests/evaluation/test_loading.py   |  15 +
 13 files changed, 730 insertions(+), 3 deletions(-)
 create mode 100644 langchain/evaluation/run_evaluators/loading.py
 create mode 100644 langchain/evaluation/run_evaluators/string_run_evaluator.py
 create mode 100644 tests/integration_tests/client/__init__.py
 create mode 100644 tests/integration_tests/client/test_runner_utils.py
 create mode 100644 tests/unit_tests/evaluation/run_evaluators/test_loading.py
diff --git a/langchain/evaluation/criteria/eval_chain.py b/langchain/evaluation/criteria/eval_chain.py
index ea6d9119f2..ef17f51f7d 100644
--- a/langchain/evaluation/criteria/eval_chain.py
+++ b/langchain/evaluation/criteria/eval_chain.py
@@ -99,6 +99,8 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
 
     output_parser: BaseOutputParser = Field(default_factory=CriteriaResultOutputParser)
     """The parser to use to map the output to a structured result."""
+    criteria_names: List[str] = Field(default_factory=list)
+    """The names of the criteria being evaluated."""
 
     class Config:
         """Configuration for the QAEvalChain."""
@@ -107,12 +109,24 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
 
     @property
     def requires_reference(self) -> bool:
+        """Whether the evaluation requires a reference text."""
         return "reference" in self.prompt.input_variables
 
     @property
     def requires_input(self) -> bool:
         return True
 
+    @property
+    def evaluation_name(self) -> str:
+        """Get the name of the evaluation.
+
+        Returns
+        -------
+        str
+            The name of the evaluation.
+        """
+        return " ".join(self.criteria_names)
+
     @property
     def _skip_reference_warning(self) -> str:
         """Warning to show when reference is ignored."""
@@ -266,9 +280,15 @@ class CriteriaEvalChain(StringEvaluator, LLMEvalChain, LLMChain):
             )
 
         criteria_ = cls.resolve_criteria(criteria)
+        criteria_names = list(criteria_.keys())
         criteria_str = " ".join(f"{k}: {v}" for k, v in criteria_.items())
         prompt_ = prompt.partial(criteria=criteria_str)
-        return cls(llm=llm, prompt=prompt_, **kwargs)
+        return cls(
+            llm=llm,
+            prompt=prompt_,
+            criteria_names=criteria_names,
+            **kwargs,
+        )
 
     def _get_eval_input(
         self,
diff --git a/langchain/evaluation/loading.py b/langchain/evaluation/loading.py
index 549b9d6cb5..e65f1b8fa6 100644
--- a/langchain/evaluation/loading.py
+++ b/langchain/evaluation/loading.py
@@ -67,9 +67,14 @@ def load_evaluator(
     Examples
     --------
     >>> llm = ChatOpenAI(model="gpt-4", temperature=0)
-    >>> evaluator = load_evaluator(EvaluatorType.QA, llm=llm)
+    >>> evaluator = load_evaluator("qa", llm=llm)
     """
     llm = llm or ChatOpenAI(model="gpt-4", temperature=0)
+    if evaluator not in _EVALUATOR_MAP:
+        raise ValueError(
+            f"Unknown evaluator type: {evaluator}"
+            f"Valid types are: {list(_EVALUATOR_MAP.keys())}"
+        )
     return _EVALUATOR_MAP[evaluator].from_llm(llm=llm, **kwargs)
 
 
diff --git a/langchain/evaluation/qa/eval_chain.py b/langchain/evaluation/qa/eval_chain.py
index cbcb564eae..8f658f9606 100644
--- a/langchain/evaluation/qa/eval_chain.py
+++ b/langchain/evaluation/qa/eval_chain.py
@@ -49,6 +49,10 @@ class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
 
         extra = Extra.ignore
 
+    @property
+    def evaluation_name(self) -> str:
+        return "correctness"
+
     @property
     def requires_reference(self) -> bool:
         return True
@@ -155,10 +159,12 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
 
     @property
     def requires_reference(self) -> bool:
+        """Whether the chain requires a reference string."""
         return True
 
     @property
     def requires_input(self) -> bool:
+        """Whether the chain requires an input string."""
         return True
 
     @classmethod
@@ -170,6 +176,10 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
                 f"but got {prompt.input_variables}"
             )
 
+    @property
+    def evaluation_name(self) -> str:
+        return "Contextual Accuracy"
+
     @classmethod
     def from_llm(
         cls,
@@ -250,6 +260,10 @@ class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
 class CotQAEvalChain(ContextQAEvalChain):
     """LLM Chain specifically for evaluating QA using chain of thought reasoning."""
 
+    @property
+    def evaluation_name(self) -> str:
+        return "COT Contextual Accuracy"
+
     @classmethod
     def from_llm(
         cls, llm: BaseLanguageModel, prompt: PromptTemplate = COT_PROMPT, **kwargs: Any
diff --git a/langchain/evaluation/run_evaluators/__init__.py b/langchain/evaluation/run_evaluators/__init__.py
index 5aba3e638a..c9ecb4e279 100644
--- a/langchain/evaluation/run_evaluators/__init__.py
+++ b/langchain/evaluation/run_evaluators/__init__.py
@@ -11,6 +11,9 @@ from langchain.evaluation.run_evaluators.implementations import (
     get_qa_evaluator,
     get_trajectory_evaluator,
 )
+from langchain.evaluation.run_evaluators.string_run_evaluator import (
+    StringRunEvaluatorChain,
+)
 
 __all__ = [
     "RunEvaluatorChain",
@@ -21,4 +24,5 @@ __all__ = [
     "get_trajectory_evaluator",
     "StringRunEvaluatorInputMapper",
     "ChoicesOutputParser",
+    "StringRunEvaluatorChain",
 ]
diff --git a/langchain/evaluation/run_evaluators/base.py b/langchain/evaluation/run_evaluators/base.py
index b640719c02..dfa90f2d80 100644
--- a/langchain/evaluation/run_evaluators/base.py
+++ b/langchain/evaluation/run_evaluators/base.py
@@ -21,6 +21,10 @@ class RunEvaluatorInputMapper:
     def map(self, run: Run, example: Optional[Example] = None) -> Dict[str, Any]:
         """Maps the Run and Optional[Example] to a dictionary"""
 
+    def __call__(self, run: Run, example: Optional[Example] = None) -> Any:
+        """Maps the Run and Optional[Example] to a dictionary"""
+        return self.map(run, example)
+
 
 class RunEvaluatorOutputParser(BaseOutputParser[EvaluationResult]):
     """Parse the output of a run."""
diff --git a/langchain/evaluation/run_evaluators/loading.py b/langchain/evaluation/run_evaluators/loading.py
new file mode 100644
index 0000000000..25e8d8fb22
--- /dev/null
+++ b/langchain/evaluation/run_evaluators/loading.py
@@ -0,0 +1,69 @@
+"""Loading helpers for run evaluators."""
+
+
+from typing import Any, List, Optional, Sequence, Union
+
+from langchainplus_sdk import RunEvaluator
+
+from langchain.base_language import BaseLanguageModel
+from langchain.chains.base import Chain
+from langchain.evaluation.loading import load_evaluators
+from langchain.evaluation.run_evaluators.string_run_evaluator import (
+    StringRunEvaluatorChain,
+)
+from langchain.evaluation.schema import EvaluatorType, StringEvaluator
+from langchain.tools.base import Tool
+
+
+def load_run_evaluators_for_model(
+    evaluators: Sequence[EvaluatorType],
+    model: Union[Chain, BaseLanguageModel, Tool],
+    *,
+    input_key: Optional[str] = None,
+    prediction_key: Optional[str] = None,
+    reference_key: Optional[str] = None,
+    eval_llm: Optional[BaseLanguageModel] = None,
+    **kwargs: Any,
+) -> List[RunEvaluator]:
+    """Load evaluators specified by a list of evaluator types.
+
+    Parameters
+    ----------
+    evaluators : Sequence[EvaluatorType]
+        The list of evaluator types to load.
+    model : Union[Chain, BaseLanguageModel, Tool]
+        The model to evaluate. Used to infer how to parse the run.
+    input_key : Optional[str]
+        A chain run's input key to map to the evaluator's input.
+    prediction_key : Optional[str]
+        The key in the run's outputs to represent the Chain prediction.
+    reference_key : Optional[str]
+        The key in the dataset example (row) outputs to use as the reference label.
+    eval_llm : BaseLanguageModel, optional
+        The language model to use for evaluation, if none is provided, a default
+        ChatOpenAI gpt-4 model will be used.
+    **kwargs : Any
+        Additional keyword arguments to pass to all evaluators.
+
+    Returns
+    -------
+    List[RunEvaluator]
+        The loaded Run evaluators.
+    """
+    evaluators_ = load_evaluators(evaluators, llm=eval_llm, **kwargs)
+    run_evaluators = []
+    for evaluator in evaluators_:
+        if isinstance(evaluator, StringEvaluator):
+            run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
+                model,
+                evaluator,
+                input_key=input_key,
+                prediction_key=prediction_key,
+                reference_key=reference_key,
+            )
+        else:
+            raise NotImplementedError(
+                f"Run evaluator for {evaluator} is not implemented"
+            )
+        run_evaluators.append(run_evaluator)
+    return run_evaluators
diff --git a/langchain/evaluation/run_evaluators/string_run_evaluator.py b/langchain/evaluation/run_evaluators/string_run_evaluator.py
new file mode 100644
index 0000000000..201c0dff1b
--- /dev/null
+++ b/langchain/evaluation/run_evaluators/string_run_evaluator.py
@@ -0,0 +1,385 @@
+"""Run evaluator wrapper for string evaluators."""
+from __future__ import annotations
+
+from abc import abstractmethod
+from typing import Any, Dict, List, Optional, Union
+
+from langchainplus_sdk import EvaluationResult, RunEvaluator
+from langchainplus_sdk.schemas import Example, Run
+
+from langchain.base_language import BaseLanguageModel
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForChainRun,
+    CallbackManagerForChainRun,
+)
+from langchain.chains.base import Chain
+from langchain.evaluation.schema import StringEvaluator
+from langchain.load.dump import dumps
+from langchain.load.load import loads
+from langchain.load.serializable import Serializable
+from langchain.schema import RUN_KEY, messages_from_dict
+from langchain.schema.messages import BaseMessage, get_buffer_string
+from langchain.tools.base import Tool
+
+
+def _get_messages_from_run_dict(messages: List[dict]) -> List[BaseMessage]:
+    if not messages:
+        return []
+    first_message = messages[0]
+    if "lc" in first_message:
+        return [loads(dumps(message)) for message in messages]
+    else:
+        return messages_from_dict(messages)
+
+
+class StringRunMapper(Serializable):
+    """Extract items to evaluate from the run object."""
+
+    @property
+    def output_keys(self) -> List[str]:
+        """The keys to extract from the run."""
+        return ["prediction", "input"]
+
+    @abstractmethod
+    def map(self, run: Run) -> Dict[str, str]:
+        """Maps the Run to a dictionary."""
+
+    def __call__(self, run: Run) -> Dict[str, str]:
+        """Maps the Run to a dictionary."""
+        if not run.outputs:
+            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
+        return self.map(run)
+
+
+class LLMStringRunMapper(StringRunMapper):
+    """Extract items to evaluate from the run object."""
+
+    def serialize_chat_messages(self, messages: List[Dict]) -> str:
+        """Extract the input messages from the run."""
+        if isinstance(messages, list) and messages:
+            if isinstance(messages[0], dict):
+                chat_messages = _get_messages_from_run_dict(messages)
+            elif isinstance(messages[0], list):
+                # Runs from Tracer have messages as a list of lists of dicts
+                chat_messages = _get_messages_from_run_dict(messages[0])
+            else:
+                raise ValueError(f"Could not extract messages to evaluate {messages}")
+            return get_buffer_string(chat_messages)
+        raise ValueError(f"Could not extract messages to evaluate {messages}")
+
+    def serialize_inputs(self, inputs: Dict) -> str:
+        if "prompts" in inputs:  # Should we even accept this?
+            input_ = "\n\n".join(inputs["prompts"])
+        elif "prompt" in inputs:
+            input_ = inputs["prompt"]
+        elif "messages" in inputs:
+            input_ = self.serialize_chat_messages(inputs["messages"])
+        else:
+            raise ValueError("LLM Run must have either messages or prompts as inputs.")
+        return input_
+
+    def serialize_outputs(self, outputs: Dict) -> str:
+        if not outputs.get("generations"):
+            raise ValueError("Cannot evaluate LLM Run without generations.")
+        generations: List[Dict] = outputs["generations"]
+        if not generations:
+            raise ValueError("Cannot evaluate LLM run with empty generations.")
+        first_generation: Dict = generations[0]
+        if isinstance(first_generation, list):
+            # Runs from Tracer have generations as a list of lists of dicts
+            # Whereas Runs from the API have a list of dicts
+            first_generation = first_generation[0]
+        if "message" in first_generation:
+            output_ = self.serialize_chat_messages([first_generation["message"]])
+        else:
+            output_ = first_generation["text"]
+        return output_
+
+    def map(self, run: Run) -> Dict[str, str]:
+        """Maps the Run to a dictionary."""
+        if run.run_type != "llm":
+            raise ValueError("LLM RunMapper only supports LLM runs.")
+        elif not run.outputs:
+            if run.error:
+                raise ValueError(
+                    f"Cannot evaluate errored LLM run {run.id}: {run.error}"
+                )
+            else:
+                raise ValueError(
+                    f"Run {run.id} has no outputs. Cannot evaluate this run."
+                )
+        else:
+            try:
+                inputs = self.serialize_inputs(run.inputs)
+            except Exception as e:
+                raise ValueError(
+                    f"Could not parse LM input from run inputs {run.inputs}"
+                ) from e
+            try:
+                output_ = self.serialize_outputs(run.outputs)
+            except Exception as e:
+                raise ValueError(
+                    f"Could not parse LM prediction from run outputs {run.outputs}"
+                ) from e
+            return {"input": inputs, "prediction": output_}
+
+
+class ChainStringRunMapper(StringRunMapper):
+    """Extract items to evaluate from the run object from a chain."""
+
+    input_key: str
+    """The key from the model Run's inputs to use as the eval input."""
+    prediction_key: str
+    """The key from the model Run's outputs to use as the eval prediction."""
+
+    @classmethod
+    def from_chain(
+        cls,
+        model: Chain,
+        input_key: Optional[str] = None,
+        prediction_key: Optional[str] = None,
+    ) -> ChainStringRunMapper:
+        """Create a RunMapper from a chain."""
+        error_messages = []
+        if input_key is None:
+            if len(model.input_keys) > 1:
+                error_messages.append(
+                    f"Chain {model.lc_namespace} has multiple input"
+                    " keys. Please specify 'input_key' when loading."
+                )
+            else:
+                input_key = model.input_keys[0]
+        elif input_key not in model.input_keys:
+            error_messages.append(
+                f"Chain {model.lc_namespace} does not have specified"
+                f" input key {input_key}."
+            )
+        if prediction_key is None:
+            if len(model.output_keys) > 1:
+                error_messages.append(
+                    f"Chain {model.lc_namespace} has multiple"
+                    " output keys. Please specify 'prediction_key' when loading."
+                )
+            else:
+                prediction_key = model.output_keys[0]
+        elif prediction_key not in model.output_keys:
+            error_messages.append(
+                f"Chain {model.lc_namespace} does not have specified"
+                f" prediction_key {prediction_key}."
+            )
+        if error_messages:
+            raise ValueError("\n".join(error_messages))
+        if input_key is None or prediction_key is None:
+            # This should never happen, but mypy doesn't know that.
+            raise ValueError(f"Chain {model.lc_namespace} has no input or output keys.")
+        return cls(input_key=input_key, prediction_key=prediction_key)
+
+    def map(self, run: Run) -> Dict[str, str]:
+        """Maps the Run to a dictionary."""
+        if not run.outputs:
+            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
+        if run.run_type != "chain":
+            raise ValueError("Chain RunMapper only supports Chain runs.")
+        if self.input_key not in run.inputs:
+            raise ValueError(f"Run {run.id} does not have input key {self.input_key}.")
+        elif self.prediction_key not in run.outputs:
+            raise ValueError(
+                f"Run {run.id} does not have prediction key {self.prediction_key}."
+            )
+        else:
+            return {
+                "input": run.inputs[self.input_key],
+                "prediction": run.outputs[self.prediction_key],
+            }
+
+
+class ToolStringRunMapper(StringRunMapper):
+    """Extract items to evaluate from a tool run."""
+
+    def map(self, run: Run) -> Dict[str, str]:
+        if not run.outputs:
+            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
+        return {"input": run.inputs["input"], "prediction": run.outputs["output"]}
+
+
+class StringExampleMapper(Serializable):
+    """Map an example, or row in the dataset, to the inputs of an evaluation."""
+
+    reference_key: Optional[str] = None
+
+    @property
+    def output_keys(self) -> List[str]:
+        """The keys to extract from the example."""
+        return ["reference"]
+
+    def serialize_chat_messages(self, messages: List[Dict]) -> str:
+        """Serialize the example's chat messages to a single string."""
+        chat_messages = _get_messages_from_run_dict(messages)
+        return get_buffer_string(chat_messages)
+
+    def map(self, example: Example) -> Dict[str, str]:
+        """Maps the Example, or dataset row to a dictionary."""
+        if not example.outputs:
+            raise ValueError(
+                f"Example {example.id} has no outputs to use as a reference."
+            )
+        if self.reference_key is None:
+            if len(example.outputs) > 1:
+                raise ValueError(
+                    f"Example {example.id} has multiple outputs, so you must"
+                    " specify a reference_key."
+                )
+            else:
+                output = list(example.outputs.values())[0]
+                return {
+                    "reference": self.serialize_chat_messages([output])
+                    if isinstance(output, dict)
+                    and output.get("type")
+                    and output.get("data")
+                    else output
+                }
+        elif self.reference_key not in example.outputs:
+            raise ValueError(
+                f"Example {example.id} does not have reference key"
+                f" {self.reference_key}."
+            )
+        return {"reference": example.outputs[self.reference_key]}
+
+    def __call__(self, example: Example) -> Dict[str, str]:
+        """Maps the Example, or dataset row, to a dictionary."""
+        if not example.outputs:
+            raise ValueError(
+                f"Example {example.id} has no outputs to use as areference label."
+            )
+        return self.map(example)
+
+
+class StringRunEvaluatorChain(Chain, RunEvaluator):
+    """Evaluate Run and optional examples."""
+
+    run_mapper: StringRunMapper
+    """Maps the Run to a dictionary with 'input' and 'prediction' strings."""
+    example_mapper: Optional[StringExampleMapper] = None
+    """Maps the Example (dataset row) to a dictionary
+    with a 'reference' string."""
+    name: str
+    """The name of the evaluation metric."""
+    string_evaluator: StringEvaluator
+    """The evaluation chain."""
+
+    @property
+    def input_keys(self) -> List[str]:
+        return ["run", "example"]
+
+    @property
+    def output_keys(self) -> List[str]:
+        return ["feedback"]
+
+    def _prepare_input(self, inputs: Dict[str, Any]) -> Dict[str, str]:
+        run: Run = inputs["run"]
+        example: Optional[Example] = inputs.get("example")
+        evaluate_strings_inputs = self.run_mapper(run)
+        if example and self.example_mapper:
+            evaluate_strings_inputs.update(self.example_mapper(example))
+        elif self.string_evaluator.requires_reference:
+            raise ValueError(
+                f"Evaluator {self.name} requires an reference"
+                " example from the dataset,"
+                f" but none was provided for run {run.id}."
+            )
+        return evaluate_strings_inputs
+
+    def _prepare_output(self, output: Dict[str, Any]) -> EvaluationResult:
+        evaluation_result = EvaluationResult(key=self.name, **output)
+        if RUN_KEY in output:
+            # TODO: Not currently surfaced. Update
+            evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
+        return evaluation_result
+
+    def _call(
+        self,
+        inputs: Dict[str, str],
+        run_manager: Optional[CallbackManagerForChainRun] = None,
+    ) -> Dict[str, Any]:
+        """Call the evaluation chain."""
+        evaluate_strings_inputs = self._prepare_input(inputs)
+        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
+        callbacks = _run_manager.get_child()
+        chain_output = self.string_evaluator.evaluate_strings(
+            **evaluate_strings_inputs,
+            callbacks=callbacks,
+        )
+        evaluation_result = self._prepare_output(chain_output)
+        return {"feedback": evaluation_result}
+
+    async def _acall(
+        self,
+        inputs: Dict[str, str],
+        run_manager: AsyncCallbackManagerForChainRun | None = None,
+    ) -> Dict[str, Any]:
+        """Call the evaluation chain."""
+        evaluate_strings_inputs = self._prepare_input(inputs)
+        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
+        callbacks = _run_manager.get_child()
+        chain_output = await self.string_evaluator.aevaluate_strings(
+            **evaluate_strings_inputs,
+            callbacks=callbacks,
+        )
+        evaluation_result = self._prepare_output(chain_output)
+        return {"feedback": evaluation_result}
+
+    def evaluate_run(
+        self, run: Run, example: Optional[Example] = None
+    ) -> EvaluationResult:
+        """Evaluate a run, optionally against a dataset example."""
+        return self({"run": run, "example": example})["feedback"]
+
+    async def aevaluate_run(
+        self, run: Run, example: Optional[Example] = None
+    ) -> EvaluationResult:
+        """Asynchronously evaluate a run, optionally against a dataset example."""
+        result = await self.acall({"run": run, "example": example})
+        return result["feedback"]
+
+    @classmethod
+    def from_model_and_evaluator(
+        cls,
+        model: Union[Chain, BaseLanguageModel, Tool],
+        evaluator: StringEvaluator,
+        input_key: Optional[str] = None,
+        prediction_key: Optional[str] = None,
+        reference_key: Optional[str] = None,
+    ) -> StringRunEvaluatorChain:
+        """Create a StringRunEvaluatorChain from a model and evaluator."""
+        if isinstance(model, BaseLanguageModel):
+            run_mapper: StringRunMapper = LLMStringRunMapper()
+        elif isinstance(model, Chain):
+            run_mapper = ChainStringRunMapper.from_chain(
+                model, input_key=input_key, prediction_key=prediction_key
+            )
+        elif isinstance(model, Tool):
+            run_mapper = ToolStringRunMapper()
+        else:
+            raise NotImplementedError(
+                f"{cls.__name__}.from_model_and_evaluator({type(model)})"
+                " not yet implemented."
+                "Expected one of [BaseLanguageModel, Chain, Tool]."
+            )
+        if reference_key is not None or isinstance(model, BaseLanguageModel):
+            example_mapper = StringExampleMapper(reference_key=reference_key)
+        elif evaluator.requires_reference:
+            # We could potentially auto-infer if there is only one string in the
+            # example, but it's preferred to raise earlier.
+            raise ValueError(
+                f"Evaluator {evaluator.evaluation_name} requires a reference"
+                " example from the dataset. Please specify the reference key from"
+                " amongst the dataset outputs keys."
+            )
+        else:
+            example_mapper = None
+        return cls(
+            name=evaluator.evaluation_name,
+            run_mapper=run_mapper,
+            example_mapper=example_mapper,
+            string_evaluator=evaluator,
+        )
diff --git a/langchain/evaluation/schema.py b/langchain/evaluation/schema.py
index 1f76aeecfb..a1a1951ba2 100644
--- a/langchain/evaluation/schema.py
+++ b/langchain/evaluation/schema.py
@@ -91,6 +91,14 @@ class _EvalArgsMixin:
 class StringEvaluator(_EvalArgsMixin, ABC):
     """Protocol for evaluating strings."""
 
+    @property
+    def evaluation_name(self) -> str:
+        raise NotImplementedError()
+
+    @property
+    def requires_reference(self) -> bool:
+        return False
+
     @abstractmethod
     def _evaluate_strings(
         self,
@@ -110,6 +118,10 @@ class StringEvaluator(_EvalArgsMixin, ABC):
             **kwargs: additional keyword arguments, including callbacks, tags, etc.
         Returns:
             dict: The evaluation results containing the score or value.
+                It is recommended that the dictionary contain the following keys:
+                    - score: the score of the evaluation, if applicable.
+                    - value: the string value of the evaluation, if applicable.
+                    - reasoning: the reasoning for the evaluation, if applicable.
         """
 
     async def _aevaluate_strings(
@@ -131,6 +143,10 @@ class StringEvaluator(_EvalArgsMixin, ABC):
             **kwargs: additional keyword arguments, including callbacks, tags, etc.
         Returns:
             dict: The evaluation results containing the score or value.
+                It is recommended that the dictionary contain the following keys:
+                    - score: the score of the evaluation, if applicable.
+                    - value: the string value of the evaluation, if applicable.
+                    - reasoning: the reasoning for the evaluation, if applicable.
         """
         raise NotImplementedError(
             f"{self.__class__.__name__} hasn't implemented an "
diff --git a/langchain/schema/messages.py b/langchain/schema/messages.py
index c03ae20358..a0cbf978d2 100644
--- a/langchain/schema/messages.py
+++ b/langchain/schema/messages.py
@@ -168,7 +168,7 @@ def _message_from_dict(message: dict) -> BaseMessage:
     elif _type == "chat":
         return ChatMessage(**message["data"])
     else:
-        raise ValueError(f"Got unexpected type: {_type}")
+        raise ValueError(f"Got unexpected message type: {_type}")
 
 
 def messages_from_dict(messages: List[dict]) -> List[BaseMessage]:
diff --git a/tests/integration_tests/client/__init__.py b/tests/integration_tests/client/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/integration_tests/client/test_runner_utils.py b/tests/integration_tests/client/test_runner_utils.py
new file mode 100644
index 0000000000..d8f6318527
--- /dev/null
+++ b/tests/integration_tests/client/test_runner_utils.py
@@ -0,0 +1,81 @@
+import sys
+from typing import Iterator
+from uuid import uuid4
+
+import pytest
+from langchainplus_sdk import LangChainPlusClient as Client
+
+from langchain.chains.llm import LLMChain
+from langchain.chat_models import ChatOpenAI
+from langchain.client.runner_utils import run_on_dataset
+from langchain.evaluation import EvaluatorType
+from langchain.evaluation.run_evaluators.loading import load_run_evaluators_for_model
+from langchain.llms.openai import OpenAI
+
+
+@pytest.fixture(
+    scope="module",  # created once and shared by every test in this module
+)
+def dataset_name() -> Iterator[str]:
+    import pandas as pd  # imported inside the fixture, not at module import time
+
+    client = Client()
+    df = pd.DataFrame(
+        [
+            {"question": "5", "answer": 5.0},
+            {"question": "5 + 3", "answer": 8.0},
+            {"question": "2^3.171", "answer": 9.006708689094099},
+            {"question": "  2 ^3.171 ", "answer": 9.006708689094099},
+        ]
+    )
+
+    uid = str(uuid4())[-8:]  # last 8 characters of a random UUID
+    _dataset_name = f"lcp integration tests - {uid}"
+    client.upload_dataframe(
+        df,
+        name=_dataset_name,
+        input_keys=["question"],
+        output_keys=["answer"],
+        description="Integration test dataset",
+    )
+    yield _dataset_name  # NOTE(review): nothing after the yield deletes the dataset
+
+
+def test_chat_model(dataset_name: str) -> None:
+    llm = ChatOpenAI(temperature=0)
+    evaluators = load_run_evaluators_for_model(
+        [EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
+    )  # "answer" is the output key the dataset fixture uploads
+    results = run_on_dataset(
+        dataset_name,
+        llm,
+        run_evaluators=evaluators,
+    )
+    print("CHAT", results, file=sys.stderr)  # results are printed, not asserted
+
+
+def test_llm(dataset_name: str) -> None:
+    llm = OpenAI(temperature=0)
+    evaluators = load_run_evaluators_for_model(
+        [EvaluatorType.QA, EvaluatorType.CRITERIA], llm, reference_key="answer"
+    )  # "answer" is the output key the dataset fixture uploads
+    results = run_on_dataset(
+        dataset_name,
+        llm,
+        run_evaluators=evaluators,
+    )
+    print("LLM", results, file=sys.stderr)  # results are printed, not asserted
+
+
+def test_chain(dataset_name: str) -> None:
+    llm = ChatOpenAI(temperature=0)
+    chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
+    evaluators = load_run_evaluators_for_model(
+        [EvaluatorType.QA, EvaluatorType.CRITERIA], chain, reference_key="answer"
+    )  # "answer" is the output key the dataset fixture uploads
+    results = run_on_dataset(
+        dataset_name,
+        lambda: chain,  # passed as a zero-argument factory, not the chain itself
+        run_evaluators=evaluators,
+    )
+    print("CHAIN", results, file=sys.stderr)  # results are printed, not asserted
diff --git a/tests/unit_tests/evaluation/run_evaluators/test_loading.py b/tests/unit_tests/evaluation/run_evaluators/test_loading.py
new file mode 100644
index 0000000000..b318521599
--- /dev/null
+++ b/tests/unit_tests/evaluation/run_evaluators/test_loading.py
@@ -0,0 +1,114 @@
+"""Test the loading function for evaluators."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler
+from langchain.evaluation.loading import load_evaluators
+from langchain.evaluation.run_evaluators.string_run_evaluator import (
+    StringRunEvaluatorChain,
+)
+from langchain.evaluation.schema import StringEvaluator
+from tests.unit_tests.chains.test_base import FakeChain
+from tests.unit_tests.llms.fake_chat_model import FakeChatModel
+from tests.unit_tests.llms.fake_llm import FakeLLM
+
+
+@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
+def test_load_string_run_evaluators_with_llm(evaluator_type: str) -> None:
+    """Check _prepare_input maps a traced LLM run to input/prediction/reference."""
+    fake_llm = FakeLLM(
+        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
+    )
+    evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0]  # type: ignore
+    if not isinstance(evaluator, StringEvaluator):
+        raise ValueError("Evaluator is not a string evaluator")
+    model = FakeLLM(queries={"text": "Foo output"}, sequential_responses=True)
+    kwargs = {}  # reference_key is passed only when the evaluator needs one
+    if evaluator.requires_reference:
+        kwargs["reference_key"] = "generations"  # key used in example.outputs below
+    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
+        model, evaluator, **kwargs
+    )
+    callback = RunCollectorCallbackHandler()
+    model.predict("Foo input", callbacks=[callback])
+    run = callback.traced_runs[0]  # first run collected from the predict() call
+    example = MagicMock()
+    example.inputs = {}
+    example.outputs = {"generations": "Foo output"}
+    result = run_evaluator._prepare_input({"run": run, "example": example})
+    assert result["input"] == "Foo input"
+    assert result["prediction"] == "Foo output"
+    if evaluator.requires_reference:
+        assert "reference" in result
+        assert result["reference"] == "Foo output"
+
+
+@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
+def test_load_string_run_evaluators_with_chat_model(evaluator_type: str) -> None:
+    """Check _prepare_input maps a traced chat-model run to evaluator inputs."""
+    fake_llm = FakeLLM(
+        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
+    )
+    evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0]  # type: ignore
+    if not isinstance(evaluator, StringEvaluator):
+        raise ValueError("Evaluator is not a string evaluator")
+    model = FakeChatModel()
+    kwargs = {}  # reference_key is passed only when the evaluator needs one
+    if evaluator.requires_reference:
+        kwargs["reference_key"] = "generations"  # key used in example.outputs below
+    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
+        model, evaluator, **kwargs
+    )
+    callback = RunCollectorCallbackHandler()
+    model.predict("Foo input", callbacks=[callback])
+    run = callback.traced_runs[0]  # first run collected from the predict() call
+    example = MagicMock()
+    example.inputs = {}
+    example.outputs = {"generations": "Another fake response"}
+    result = run_evaluator._prepare_input({"run": run, "example": example})
+    assert result["input"] == "Human: Foo input"  # role-prefixed string form
+    assert result["prediction"] == "AI: fake response"  # role-prefixed string form
+    if evaluator.requires_reference:
+        assert "reference" in result
+        assert result["reference"] == "Another fake response"
+
+
+@pytest.mark.parametrize("evaluator_type", ["qa", "cot_qa", "context_qa", "criteria"])
+def test_load_string_run_evaluators_with_chain(evaluator_type: str) -> None:
+    model = FakeChain(
+        the_input_keys=["an_input", "another_input"],
+    )
+    fake_llm = FakeChatModel()
+    evaluator = load_evaluators([evaluator_type], llm=fake_llm)[0]  # type: ignore
+    if not isinstance(evaluator, StringEvaluator):
+        raise ValueError("Evaluator is not a string evaluator")
+    # Two input keys: omitting input_key is ambiguous; an unknown key is rejected
+    with pytest.raises(ValueError, match="multiple input keys"):
+        StringRunEvaluatorChain.from_model_and_evaluator(model, evaluator)
+    with pytest.raises(ValueError, match="does not have specified"):
+        StringRunEvaluatorChain.from_model_and_evaluator(
+            model, evaluator, input_key="some_input"
+        )
+    kwargs = {}  # reference_key is passed only when the evaluator needs one
+    if evaluator.requires_reference:
+        kwargs["reference_key"] = "label_column"  # key used in example.outputs below
+    run_evaluator = StringRunEvaluatorChain.from_model_and_evaluator(
+        model, evaluator, input_key="an_input", **kwargs
+    )
+    callback = RunCollectorCallbackHandler()
+    model(
+        {"an_input": "Foo input", "another_input": "Another fake response"},
+        callbacks=[callback],
+    )
+    run = callback.traced_runs[0]  # first run collected from the chain call above
+    example = MagicMock()
+    example.inputs = {}
+    example.outputs = {"label_column": "Another fake response"}
+    result = run_evaluator._prepare_input({"run": run, "example": example})
+    assert result["input"] == "Foo input"  # value of the selected "an_input" key
+    assert result["prediction"] == "baz"  # presumably FakeChain's output; confirm
+    if evaluator.requires_reference:
+        assert "reference" in result
+        assert result["reference"] == "Another fake response"
diff --git a/tests/unit_tests/evaluation/test_loading.py b/tests/unit_tests/evaluation/test_loading.py
index 27c538d8b3..e707246fb0 100644
--- a/tests/unit_tests/evaluation/test_loading.py
+++ b/tests/unit_tests/evaluation/test_loading.py
@@ -3,7 +3,9 @@
 import pytest
 
 from langchain.evaluation.loading import EvaluatorType, load_evaluators
+from langchain.evaluation.schema import StringEvaluator
 from tests.unit_tests.llms.fake_chat_model import FakeChatModel
+from tests.unit_tests.llms.fake_llm import FakeLLM
 
 
 @pytest.mark.parametrize("evaluator_type", EvaluatorType)
@@ -14,3 +16,16 @@ def test_load_evaluators(evaluator_type: EvaluatorType) -> None:
 
     # Test as string
     load_evaluators([evaluator_type.value], llm=fake_llm)  # type: ignore
+
+
+def test_criteria_eval_chain_requires_reference() -> None:
+    """Test that requires_reference=True is propagated to the criteria evaluator."""
+    fake_llm = FakeLLM(
+        queries={"text": "The meaning of life\nCORRECT"}, sequential_responses=True
+    )
+    evaluator = load_evaluators(
+        [EvaluatorType.CRITERIA], llm=fake_llm, requires_reference=True
+    )[0]
+    if not isinstance(evaluator, StringEvaluator):
+        raise ValueError("Evaluator is not a string evaluator")
+    assert evaluator.requires_reference