Enhanced pairwise error (#11131)

1 year ago · d3c2ca5656
parent b7e9db5e73
commit d3c2ca5656
3 changed files with 21 additions and 3 deletions
--- a/docs/docs_skeleton/docs/guides/evaluation/comparison/index.mdx
+++ b/docs/docs_skeleton/docs/guides/evaluation/comparison/index.mdx
@@ -16,6 +16,10 @@ Here's a summary of the key methods and properties of a comparison evaluator:
 - `requires_input`: This property indicates whether this evaluator requires an input string.
 - `requires_reference`: This property specifies whether this evaluator requires a reference label.

+:::note LangSmith Support
+The [run_on_dataset](https://api.python.langchain.com/en/latest/api_reference.html#module-langchain.smith) evaluation method is designed to evaluate only a single model at a time, and thus, doesn't support these evaluators.
+:::
+
 Detailed information about creating custom evaluators and the available built-in comparison evaluators is provided in the following sections.

 import DocCardList from "@theme/DocCardList";
--- a/docs/extras/integrations/llms/titan_takeoff.ipynb
+++ b/docs/extras/integrations/llms/titan_takeoff.ipynb
@@ -17,7 +17,7 @@
   "source": [
    "## Installation\n",
    "\n",
-    "To get started with Iris Takeoff, all you need is to have docker and python installed on your local system. If you wish to use the server with gpu suport, then you will need to install docker with cuda support.\n",
+    "To get started with Iris Takeoff, all you need is to have docker and python installed on your local system. If you wish to use the server with gpu support, then you will need to install docker with cuda support.\n",
    "\n",
    "For Mac and Windows users, make sure you have the docker daemon running! You can check this by running docker ps in your terminal. To start the daemon, open the docker desktop app.\n",
    "\n",
@@ -157,7 +157,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from langchain.prompts import PromptTemplate\nfrom langchain.chains import LLMChain\n",
+    "from langchain.prompts import PromptTemplate\n",
+    "from langchain.chains import LLMChain\n",
    "\n",
    "llm = TitanTakeoff()\n",
    "\n",
--- a/libs/langchain/langchain/smith/evaluation/runner_utils.py
+++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py
@@ -28,7 +28,11 @@ from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler
 from langchain.callbacks.tracers.langchain import LangChainTracer, wait_for_all_tracers
 from langchain.chains.base import Chain
 from langchain.evaluation.loading import load_evaluator
-from langchain.evaluation.schema import EvaluatorType, StringEvaluator
+from langchain.evaluation.schema import (
+    EvaluatorType,
+    PairwiseStringEvaluator,
+    StringEvaluator,
+)
 from langchain.schema import ChatResult, LLMResult
 from langchain.schema.language_model import BaseLanguageModel
 from langchain.schema.messages import BaseMessage, messages_from_dict
@@ -486,6 +490,15 @@ def _construct_run_evaluator(
            reference_key=reference_key,
            tags=[eval_type_tag],
        )
+    elif isinstance(evaluator_, PairwiseStringEvaluator):
+        raise NotImplementedError(
+            f"Run evaluator for {eval_type_tag} is not implemented."
+            " PairwiseStringEvaluators compare the outputs of two different models"
+            " rather than the output of a single model."
+            " Did you mean to use a StringEvaluator instead?"
+            "\nSee: https://python.langchain.com/docs/guides/evaluation/string/"
+        )
+
    else:
        raise NotImplementedError(
            f"Run evaluator for {eval_type_tag} is not implemented"