diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py
index 291d3a9fc1..403348c523 100644
--- a/libs/langchain/langchain/smith/evaluation/runner_utils.py
+++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py
@@ -116,6 +116,7 @@ class TestResult(dict):
                 **{f.key: f.score for f in feedback},
                 "input": result["input"],
                 "output": result["output"],
+                "execution_time": result["execution_time"],
             }
             if "reference" in result:
                 r["reference"] = result["reference"]
@@ -418,12 +419,17 @@ def _determine_input_key(
     if config.input_key:
         input_key = config.input_key
         if run_inputs and input_key not in run_inputs:
-            raise ValueError(f"Input key {input_key} not in run inputs {run_inputs}")
+            logger.warning(
+                f"Input key {input_key} not in chain's specified"
+                f" input keys {run_inputs}. Evaluation behavior may be undefined."
+            )
     elif run_inputs and len(run_inputs) == 1:
         input_key = run_inputs[0]
     elif run_inputs is not None and len(run_inputs) > 1:
-        raise ValueError(
-            f"Must specify input key for model with multiple inputs: {run_inputs}"
+        logger.warning(
+            f"Chain expects multiple input keys: {run_inputs},"
+            f" Evaluator is likely to fail. Evaluation behavior may be undefined."
+            " Specify an input_key in the RunEvalConfig to avoid this warning."
         )
 
     return input_key
@@ -437,15 +443,17 @@ def _determine_prediction_key(
     if config.prediction_key:
         prediction_key = config.prediction_key
         if run_outputs and prediction_key not in run_outputs:
-            raise ValueError(
-                f"Prediction key {prediction_key} not in run outputs {run_outputs}"
+            logger.warning(
+                f"Prediction key {prediction_key} not in chain's specified"
+                f" output keys {run_outputs}. Evaluation behavior may be undefined."
             )
     elif run_outputs and len(run_outputs) == 1:
         prediction_key = run_outputs[0]
     elif run_outputs is not None and len(run_outputs) > 1:
-        raise ValueError(
-            f"Must specify prediction key for model"
-            f" with multiple outputs: {run_outputs}"
+        logger.warning(
+            f"Chain expects multiple output keys: {run_outputs},"
+            f" Evaluation behavior may be undefined. Specify a prediction_key"
+            " in the RunEvalConfig to avoid this warning."
         )
 
     return prediction_key
@@ -978,6 +986,14 @@ def _collect_test_results(
                 all_eval_results.update(
                     {example_id: v for (_, example_id), v in eval_results.items()}
                 )
+            elif isinstance(callback, LangChainTracer):
+                run = callback.latest_run
+                execution_time = (
+                    (run.end_time - run.start_time).total_seconds()
+                    if run and run.end_time
+                    else None
+                )
+
     results = {}
     for example, output in zip(examples, batch_results):
         feedback = all_eval_results.get(str(example.id), [])
@@ -985,6 +1001,7 @@
             "output": output,
             "input": example.inputs,
             "feedback": feedback,
+            "execution_time": execution_time,
         }
         if example.outputs:
             results[str(example.id)]["reference"] = example.outputs
diff --git a/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py b/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py
index 4974deefb2..3d3a6b1a40 100644
--- a/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py
+++ b/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py
@@ -5,6 +5,7 @@ from typing import Any, Dict, Iterator, List, Optional, Union
 from unittest import mock
 
 import pytest
+from freezegun import freeze_time
 from langsmith.client import Client
 from langsmith.schemas import Dataset, Example
 
@@ -239,6 +240,7 @@ def test_run_chat_model_all_formats(inputs: Dict[str, Any]) -> None:
 
 
 @pytest.mark.asyncio
+@freeze_time("2023-01-01")
 async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
     dataset = Dataset(
         id=uuid.uuid4(),
@@ -341,6 +343,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
             else None
         },
         "feedback": [],
+        # No run since we mock the call to the llm above
+        "execution_time": None,
     }
     for example in examples
 }
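
Note on the relaxed key resolution above: where _determine_input_key and _determine_prediction_key previously raised ValueError on a mismatched or ambiguous key, they now emit logger.warning and continue. The sketch below is a simplified, standalone rendering of that input-key pattern; resolve_input_key and the sample key lists are illustrative, not the library's private helper.

import logging
from typing import List, Optional

logger = logging.getLogger(__name__)


def resolve_input_key(
    configured_key: Optional[str], run_inputs: Optional[List[str]]
) -> Optional[str]:
    """Mirror of the relaxed resolution: warn on mismatch/ambiguity, never raise."""
    input_key = None
    if configured_key:
        input_key = configured_key
        if run_inputs and input_key not in run_inputs:
            logger.warning(
                "Input key %s not in chain's specified input keys %s."
                " Evaluation behavior may be undefined.",
                input_key,
                run_inputs,
            )
    elif run_inputs and len(run_inputs) == 1:
        input_key = run_inputs[0]
    elif run_inputs is not None and len(run_inputs) > 1:
        logger.warning(
            "Chain expects multiple input keys: %s. Specify an input_key"
            " in the RunEvalConfig to avoid this warning.",
            run_inputs,
        )
    return input_key


logging.basicConfig(level=logging.WARNING)
print(resolve_input_key("question", ["query"]))       # warns, still returns "question"
print(resolve_input_key(None, ["query"]))             # unambiguous: returns "query"
print(resolve_input_key(None, ["query", "context"]))  # warns, returns None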
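
And a minimal sketch of the per-example record that _collect_test_results now assembles, with execution_time taken from the LangChainTracer run when one is captured and None otherwise (as the updated test expects when the model call is mocked). The field names come from the hunks above; the sample values and the average_execution_time helper are made up for illustration.

from typing import Any, Dict, List, Optional


def average_execution_time(results: Dict[str, Dict[str, Any]]) -> Optional[float]:
    """Average the per-example execution_time values, skipping missing ones."""
    times: List[float] = [
        r["execution_time"]
        for r in results.values()
        if r.get("execution_time") is not None
    ]
    return sum(times) / len(times) if times else None


# Shape of the records built in _collect_test_results (values are illustrative).
sample_results = {
    "example-id-1": {
        "input": {"input": "What is 2 + 2?"},
        "output": {"text": "4"},
        "feedback": [],
        # (run.end_time - run.start_time).total_seconds() from the tracer run
        "execution_time": 0.42,
    },
    "example-id-2": {
        "input": {"input": "Spell 'cat'."},
        "output": {"text": "c-a-t"},
        "feedback": [],
        "execution_time": None,  # no tracer run captured (e.g., mocked model)
    },
}

print(average_execution_time(sample_results))  # -> 0.42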