Add execution time (#13542)

And warn instead of raising an error, since the chain API is too
inconsistent.
William FH committed by GitHub
parent 0fb5f857f9
commit c56faa6ef1

@@ -116,6 +116,7 @@ class TestResult(dict):
                 **{f.key: f.score for f in feedback},
                 "input": result["input"],
                 "output": result["output"],
+                "execution_time": result["execution_time"],
             }
             if "reference" in result:
                 r["reference"] = result["reference"]
@@ -418,12 +419,17 @@ def _determine_input_key(
     if config.input_key:
         input_key = config.input_key
         if run_inputs and input_key not in run_inputs:
-            raise ValueError(f"Input key {input_key} not in run inputs {run_inputs}")
+            logger.warning(
+                f"Input key {input_key} not in chain's specified"
+                f" input keys {run_inputs}. Evaluation behavior may be undefined."
+            )
     elif run_inputs and len(run_inputs) == 1:
         input_key = run_inputs[0]
     elif run_inputs is not None and len(run_inputs) > 1:
-        raise ValueError(
-            f"Must specify input key for model with multiple inputs: {run_inputs}"
+        logger.warning(
+            f"Chain expects multiple input keys: {run_inputs},"
+            f" Evaluator is likely to fail. Evaluation behavior may be undefined."
+            " Specify an input_key in the RunEvalConfig to avoid this warning."
         )
     return input_key
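
As a standalone sketch, the control flow after this change (function and parameter names are illustrative, not the module's exact signature): a mismatched or ambiguous input key now logs a warning and lets evaluation proceed, instead of raising.

import logging
from typing import List, Optional

logger = logging.getLogger(__name__)

def determine_input_key(config_input_key: Optional[str],
                        run_inputs: Optional[List[str]]) -> Optional[str]:
    input_key = None
    if config_input_key:
        input_key = config_input_key
        if run_inputs and input_key not in run_inputs:
            # Previously a ValueError; chains report their input keys
            # inconsistently, so this is now only a warning.
            logger.warning(
                "Input key %s not in chain's specified input keys %s."
                " Evaluation behavior may be undefined.",
                input_key, run_inputs,
            )
    elif run_inputs and len(run_inputs) == 1:
        # Unambiguous: the chain has exactly one input key.
        input_key = run_inputs[0]
    elif run_inputs is not None and len(run_inputs) > 1:
        logger.warning(
            "Chain expects multiple input keys: %s. Specify an input_key"
            " in the RunEvalConfig to avoid this warning.",
            run_inputs,
        )
    return input_key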
@@ -437,15 +443,17 @@ def _determine_prediction_key(
     if config.prediction_key:
         prediction_key = config.prediction_key
         if run_outputs and prediction_key not in run_outputs:
-            raise ValueError(
-                f"Prediction key {prediction_key} not in run outputs {run_outputs}"
+            logger.warning(
+                f"Prediction key {prediction_key} not in chain's specified"
+                f" output keys {run_outputs}. Evaluation behavior may be undefined."
             )
     elif run_outputs and len(run_outputs) == 1:
         prediction_key = run_outputs[0]
     elif run_outputs is not None and len(run_outputs) > 1:
-        raise ValueError(
-            f"Must specify prediction key for model"
-            f" with multiple outputs: {run_outputs}"
+        logger.warning(
+            f"Chain expects multiple output keys: {run_outputs},"
+            f" Evaluation behavior may be undefined. Specify a prediction_key"
+            " in the RunEvalConfig to avoid this warning."
         )
     return prediction_key
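
Both warnings above point to the same escape hatch: pin the keys explicitly in the eval config. A hypothetical usage sketch (the field names follow the warning messages above; the import path may vary by release):

from langchain.smith import RunEvalConfig

eval_config = RunEvalConfig(
    input_key="question",     # which chain input the evaluator should read
    prediction_key="answer",  # which chain output counts as the prediction
)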
@@ -978,6 +986,14 @@ def _collect_test_results(
             all_eval_results.update(
                 {example_id: v for (_, example_id), v in eval_results.items()}
             )
+        elif isinstance(callback, LangChainTracer):
+            run = callback.latest_run
+            execution_time = (
+                (run.end_time - run.start_time).total_seconds()
+                if run and run.end_time
+                else None
+            )
     results = {}
     for example, output in zip(examples, batch_results):
         feedback = all_eval_results.get(str(example.id), [])
@@ -985,6 +1001,7 @@ def _collect_test_results(
             "output": output,
             "input": example.inputs,
             "feedback": feedback,
+            "execution_time": execution_time,
         }
         if example.outputs:
             results[str(example.id)]["reference"] = example.outputs
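
A minimal sketch of the timing logic above: the tracer's latest run carries datetime start/end stamps, and execution_time is their difference in seconds, or None when the run never started or never finished (compute_execution_time is an illustrative helper, not part of the module).

from datetime import datetime, timezone
from typing import Optional

def compute_execution_time(start: Optional[datetime],
                           end: Optional[datetime]) -> Optional[float]:
    # Mirrors the guard above: no end time means no measurable duration.
    if start is None or end is None:
        return None
    return (end - start).total_seconds()

t0 = datetime(2023, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
t1 = datetime(2023, 1, 1, 12, 0, 3, tzinfo=timezone.utc)
assert compute_execution_time(t0, t1) == 3.0
assert compute_execution_time(t0, None) is None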

@@ -5,6 +5,7 @@ from typing import Any, Dict, Iterator, List, Optional, Union
 from unittest import mock
 import pytest
+from freezegun import freeze_time
 from langsmith.client import Client
 from langsmith.schemas import Dataset, Example
@@ -239,6 +240,7 @@ def test_run_chat_model_all_formats(inputs: Dict[str, Any]) -> None:
 @pytest.mark.asyncio
+@freeze_time("2023-01-01")
 async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
     dataset = Dataset(
         id=uuid.uuid4(),
@@ -341,6 +343,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
                 else None
             },
             "feedback": [],
+            # No run since we mock the call to the llm above
+            "execution_time": None,
         }
         for example in examples
     }
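
The test expects execution_time to be None because the LLM call is mocked, so no LangChainTracer run is ever recorded. freeze_time additionally pins the clock so any timestamps taken during the test are deterministic; a minimal illustration of that behavior (not the test itself):

from datetime import datetime
from freezegun import freeze_time

with freeze_time("2023-01-01"):
    start = datetime.now()
    end = datetime.now()
# Both reads return the frozen instant, so derived durations are zero.
assert (end - start).total_seconds() == 0.0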
