diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py
index 291d3a9fc1..403348c523 100644
--- a/libs/langchain/langchain/smith/evaluation/runner_utils.py
+++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py
@@ -116,6 +116,7 @@ class TestResult(dict):
                 **{f.key: f.score for f in feedback},
                 "input": result["input"],
                 "output": result["output"],
+                "execution_time": result["execution_time"],
             }
             if "reference" in result:
                 r["reference"] = result["reference"]
@@ -418,12 +419,17 @@ def _determine_input_key(
     if config.input_key:
         input_key = config.input_key
         if run_inputs and input_key not in run_inputs:
-            raise ValueError(f"Input key {input_key} not in run inputs {run_inputs}")
+            logger.warning(
+                f"Input key {input_key} not in chain's specified"
+                f" input keys {run_inputs}. Evaluation behavior may be undefined."
+            )
     elif run_inputs and len(run_inputs) == 1:
         input_key = run_inputs[0]
     elif run_inputs is not None and len(run_inputs) > 1:
-        raise ValueError(
-            f"Must specify input key for model with multiple inputs: {run_inputs}"
+        logger.warning(
+            f"Chain expects multiple input keys: {run_inputs},"
+            f" Evaluator is likely to fail. Evaluation behavior may be undefined."
+            " Specify an input_key in the RunEvalConfig to avoid this warning."
         )
 
     return input_key
@@ -437,15 +443,17 @@ def _determine_prediction_key(
     if config.prediction_key:
         prediction_key = config.prediction_key
         if run_outputs and prediction_key not in run_outputs:
-            raise ValueError(
-                f"Prediction key {prediction_key} not in run outputs {run_outputs}"
+            logger.warning(
+                f"Prediction key {prediction_key} not in chain's specified"
+                f" output keys {run_outputs}. Evaluation behavior may be undefined."
             )
     elif run_outputs and len(run_outputs) == 1:
         prediction_key = run_outputs[0]
     elif run_outputs is not None and len(run_outputs) > 1:
-        raise ValueError(
-            f"Must specify prediction key for model"
-            f" with multiple outputs: {run_outputs}"
+        logger.warning(
+            f"Chain expects multiple output keys: {run_outputs},"
+            f" Evaluation behavior may be undefined. Specify a prediction_key"
+            " in the RunEvalConfig to avoid this warning."
         )
 
     return prediction_key
@@ -978,6 +986,14 @@ def _collect_test_results(
                 all_eval_results.update(
                     {example_id: v for (_, example_id), v in eval_results.items()}
                 )
+            elif isinstance(callback, LangChainTracer):
+                run = callback.latest_run
+                execution_time = (
+                    (run.end_time - run.start_time).total_seconds()
+                    if run and run.end_time
+                    else None
+                )
+
     results = {}
     for example, output in zip(examples, batch_results):
         feedback = all_eval_results.get(str(example.id), [])
@@ -985,6 +1001,7 @@
             "output": output,
             "input": example.inputs,
             "feedback": feedback,
+            "execution_time": execution_time,
         }
         if example.outputs:
             results[str(example.id)]["reference"] = example.outputs
diff --git a/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py b/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py
index 4974deefb2..3d3a6b1a40 100644
--- a/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py
+++ b/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py
@@ -5,6 +5,7 @@ from typing import Any, Dict, Iterator, List, Optional, Union
 from unittest import mock
 
 import pytest
+from freezegun import freeze_time
 from langsmith.client import Client
 from langsmith.schemas import Dataset, Example
 
@@ -239,6 +240,7 @@ def test_run_chat_model_all_formats(inputs: Dict[str, Any]) -> None:
 
 
 @pytest.mark.asyncio
+@freeze_time("2023-01-01")
 async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
     dataset = Dataset(
         id=uuid.uuid4(),
@@ -341,6 +343,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
             else None
         },
         "feedback": [],
+        # No run since we mock the call to the llm above
+        "execution_time": None,
     }
     for example in examples
 }
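
Note on the relaxed key resolution above: where _determine_input_key and _determine_prediction_key previously raised ValueError on a mismatched or ambiguous key, they now emit logger.warning and continue. The sketch below is a simplified, standalone rendering of that input-key pattern; resolve_input_key and the sample key lists are illustrative, not the library's private helper.

import logging
from typing import List, Optional

logger = logging.getLogger(__name__)


def resolve_input_key(
    configured_key: Optional[str], run_inputs: Optional[List[str]]
) -> Optional[str]:
    """Mirror of the relaxed resolution: warn on mismatch/ambiguity, never raise."""
    input_key = None
    if configured_key:
        input_key = configured_key
        if run_inputs and input_key not in run_inputs:
            logger.warning(
                "Input key %s not in chain's specified input keys %s."
                " Evaluation behavior may be undefined.",
                input_key,
                run_inputs,
            )
    elif run_inputs and len(run_inputs) == 1:
        input_key = run_inputs[0]
    elif run_inputs is not None and len(run_inputs) > 1:
        logger.warning(
            "Chain expects multiple input keys: %s. Specify an input_key"
            " in the RunEvalConfig to avoid this warning.",
            run_inputs,
        )
    return input_key


logging.basicConfig(level=logging.WARNING)
print(resolve_input_key("question", ["query"]))       # warns, still returns "question"
print(resolve_input_key(None, ["query"]))             # unambiguous: returns "query"
print(resolve_input_key(None, ["query", "context"]))  # warns, returns None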
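
And a minimal sketch of the per-example record that _collect_test_results now assembles, with execution_time taken from the LangChainTracer run when one is captured and None otherwise (as the updated test expects when the model call is mocked). The field names come from the hunks above; the sample values and the average_execution_time helper are made up for illustration.

from typing import Any, Dict, List, Optional


def average_execution_time(results: Dict[str, Dict[str, Any]]) -> Optional[float]:
    """Average the per-example execution_time values, skipping missing ones."""
    times: List[float] = [
        r["execution_time"]
        for r in results.values()
        if r.get("execution_time") is not None
    ]
    return sum(times) / len(times) if times else None


# Shape of the records built in _collect_test_results (values are illustrative).
sample_results = {
    "example-id-1": {
        "input": {"input": "What is 2 + 2?"},
        "output": {"text": "4"},
        "feedback": [],
        # (run.end_time - run.start_time).total_seconds() from the tracer run
        "execution_time": 0.42,
    },
    "example-id-2": {
        "input": {"input": "Spell 'cat'."},
        "output": {"text": "c-a-t"},
        "feedback": [],
        "execution_time": None,  # no tracer run captured (e.g., mocked model)
    },
}

print(average_execution_time(sample_results))  # -> 0.42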