Add execution time (#13542)

And warn instead of raising an error, since the chain API is too
inconsistent.
William FH committed by GitHub
parent 0fb5f857f9
commit c56faa6ef1

@@ -116,6 +116,7 @@ class TestResult(dict):
                 **{f.key: f.score for f in feedback},
                 "input": result["input"],
                 "output": result["output"],
+                "execution_time": result["execution_time"],
             }
             if "reference" in result:
                 r["reference"] = result["reference"]
@@ -418,12 +419,17 @@ def _determine_input_key(
     if config.input_key:
         input_key = config.input_key
         if run_inputs and input_key not in run_inputs:
-            raise ValueError(f"Input key {input_key} not in run inputs {run_inputs}")
+            logger.warning(
+                f"Input key {input_key} not in chain's specified"
+                f" input keys {run_inputs}. Evaluation behavior may be undefined."
+            )
     elif run_inputs and len(run_inputs) == 1:
         input_key = run_inputs[0]
     elif run_inputs is not None and len(run_inputs) > 1:
-        raise ValueError(
-            f"Must specify input key for model with multiple inputs: {run_inputs}"
+        logger.warning(
+            f"Chain expects multiple input keys: {run_inputs},"
+            f" Evaluator is likely to fail. Evaluation behavior may be undefined."
+            " Specify an input_key in the RunEvalConfig to avoid this warning."
         )
     return input_key
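
As a standalone sketch, the control flow after this change (function and parameter names are illustrative, not the module's exact signature): a mismatched or ambiguous input key now logs a warning and lets evaluation proceed, instead of raising.

import logging
from typing import List, Optional

logger = logging.getLogger(__name__)

def determine_input_key(config_input_key: Optional[str],
                        run_inputs: Optional[List[str]]) -> Optional[str]:
    input_key = None
    if config_input_key:
        input_key = config_input_key
        if run_inputs and input_key not in run_inputs:
            # Previously a ValueError; chains report their input keys
            # inconsistently, so this is now only a warning.
            logger.warning(
                "Input key %s not in chain's specified input keys %s."
                " Evaluation behavior may be undefined.",
                input_key, run_inputs,
            )
    elif run_inputs and len(run_inputs) == 1:
        # Unambiguous: the chain has exactly one input key.
        input_key = run_inputs[0]
    elif run_inputs is not None and len(run_inputs) > 1:
        logger.warning(
            "Chain expects multiple input keys: %s. Specify an input_key"
            " in the RunEvalConfig to avoid this warning.",
            run_inputs,
        )
    return input_key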
@@ -437,15 +443,17 @@ def _determine_prediction_key(
     if config.prediction_key:
         prediction_key = config.prediction_key
         if run_outputs and prediction_key not in run_outputs:
-            raise ValueError(
-                f"Prediction key {prediction_key} not in run outputs {run_outputs}"
+            logger.warning(
+                f"Prediction key {prediction_key} not in chain's specified"
+                f" output keys {run_outputs}. Evaluation behavior may be undefined."
             )
     elif run_outputs and len(run_outputs) == 1:
         prediction_key = run_outputs[0]
     elif run_outputs is not None and len(run_outputs) > 1:
-        raise ValueError(
-            f"Must specify prediction key for model"
-            f" with multiple outputs: {run_outputs}"
+        logger.warning(
+            f"Chain expects multiple output keys: {run_outputs},"
+            f" Evaluation behavior may be undefined. Specify a prediction_key"
+            " in the RunEvalConfig to avoid this warning."
         )
     return prediction_key
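
Both warnings above point to the same escape hatch: pin the keys explicitly in the eval config. A hypothetical usage sketch (the field names follow the warning messages above; the import path may vary by release):

from langchain.smith import RunEvalConfig

eval_config = RunEvalConfig(
    input_key="question",     # which chain input the evaluator should read
    prediction_key="answer",  # which chain output counts as the prediction
)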
@@ -978,6 +986,14 @@ def _collect_test_results(
             all_eval_results.update(
                 {example_id: v for (_, example_id), v in eval_results.items()}
             )
+        elif isinstance(callback, LangChainTracer):
+            run = callback.latest_run
+            execution_time = (
+                (run.end_time - run.start_time).total_seconds()
+                if run and run.end_time
+                else None
+            )
     results = {}
     for example, output in zip(examples, batch_results):
         feedback = all_eval_results.get(str(example.id), [])
@@ -985,6 +1001,7 @@ def _collect_test_results(
             "output": output,
             "input": example.inputs,
             "feedback": feedback,
+            "execution_time": execution_time,
         }
         if example.outputs:
             results[str(example.id)]["reference"] = example.outputs
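
A minimal sketch of the timing logic above: the tracer's latest run carries datetime start/end stamps, and execution_time is their difference in seconds, or None when the run never started or never finished (compute_execution_time is an illustrative helper, not part of the module).

from datetime import datetime, timezone
from typing import Optional

def compute_execution_time(start: Optional[datetime],
                           end: Optional[datetime]) -> Optional[float]:
    # Mirrors the guard above: no end time means no measurable duration.
    if start is None or end is None:
        return None
    return (end - start).total_seconds()

t0 = datetime(2023, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
t1 = datetime(2023, 1, 1, 12, 0, 3, tzinfo=timezone.utc)
assert compute_execution_time(t0, t1) == 3.0
assert compute_execution_time(t0, None) is None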

@@ -5,6 +5,7 @@ from typing import Any, Dict, Iterator, List, Optional, Union
 from unittest import mock
 import pytest
+from freezegun import freeze_time
 from langsmith.client import Client
 from langsmith.schemas import Dataset, Example
@@ -239,6 +240,7 @@ def test_run_chat_model_all_formats(inputs: Dict[str, Any]) -> None:
 @pytest.mark.asyncio
+@freeze_time("2023-01-01")
 async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
     dataset = Dataset(
         id=uuid.uuid4(),
@@ -341,6 +343,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
                 else None
             },
             "feedback": [],
+            # No run since we mock the call to the llm above
+            "execution_time": None,
         }
         for example in examples
     }
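
The test expects execution_time to be None because the LLM call is mocked, so no LangChainTracer run is ever recorded. freeze_time additionally pins the clock so any timestamps taken during the test are deterministic; a minimal illustration of that behavior (not the test itself):

from datetime import datetime
from freezegun import freeze_time

with freeze_time("2023-01-01"):
    start = datetime.now()
    end = datetime.now()
# Both reads return the frozen instant, so derived durations are zero.
assert (end - start).total_seconds() == 0.0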
