Add support for project metadata in run_on_dataset (#11200)

William FH authored 11 months ago, committed by GitHub
parent b11f21c25f
commit 73693c18fc

@@ -862,6 +862,7 @@ def _prepare_eval_run(
dataset_name: str,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
project_name: str,
project_metadata: Optional[Dict[str, Any]] = None,
) -> Tuple[MCF, str, Dataset, List[Example]]:
wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
dataset = client.read_dataset(dataset_name=dataset_name)
@@ -869,6 +870,7 @@ def _prepare_eval_run(
project = client.create_project(
project_name,
reference_dataset_id=dataset.id,
project_extra={"metadata": project_metadata} if project_metadata else {},
)
except ValueError as e:
if "already exists " not in str(e):
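
# A minimal sketch of the call this change produces when a project is created with
# metadata, assuming a LangSmith Client configured via the LANGCHAIN_API_KEY
# environment variable; the dataset/project names and metadata keys are illustrative.
from langsmith import Client

client = Client()
dataset = client.read_dataset(dataset_name="my-eval-dataset")
project = client.create_project(
    "my-eval-project",
    reference_dataset_id=dataset.id,
    # project_metadata is nested under the "metadata" key of project_extra
    project_extra={"metadata": {"prompt_version": "v2", "model": "gpt-3.5-turbo"}},
)
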
@@ -895,10 +897,15 @@ def _prepare_run_on_dataset(
tags: Optional[List[str]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
concurrency_level: int = 5,
project_metadata: Optional[Dict[str, Any]] = None,
) -> Tuple[MCF, str, List[Example], List[RunnableConfig]]:
project_name = project_name or name_generation.random_name()
wrapped_model, project_name, dataset, examples = _prepare_eval_run(
client, dataset_name, llm_or_chain_factory, project_name
client,
dataset_name,
llm_or_chain_factory,
project_name,
project_metadata=project_metadata,
)
wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory)
run_evaluators = _setup_evaluation(
@@ -958,126 +965,41 @@ def _collect_test_results(
)
_INPUT_MAPPER_DEP_WARNING = (
    "The input_mapper argument is deprecated and "
    "will be removed in a future release. Please add a "
    "RunnableLambda to your chain to map inputs to the expected format"
    " instead. Example:\n"
    "def construct_chain():\n"
    "    my_chain = ...\n"
    "    input_mapper = RunnableLambda(lambda d: {'my_input_key': d['other_key']})\n"
    "    return input_mapper | my_chain\n"
    "run_on_dataset(..., llm_or_chain_factory=construct_chain)\n"
    "(See https://api.python.langchain.com/en/latest/schema/"
    "langchain.schema.runnable.base.RunnableLambda.html)"
)
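
# A hedged sketch of the migration the warning above recommends: wrap the input
# mapping in a RunnableLambda and prepend it to the chain inside the factory.
# The key names ("some_input" -> "question") mirror the integration tests below
# and are otherwise illustrative.
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnableLambda


def construct_chain():
    llm = ChatOpenAI(temperature=0)
    chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
    # Map each example's inputs to the keys the chain expects.
    input_mapper = RunnableLambda(lambda example: {"question": example["some_input"]})
    return input_mapper | chain

# run_on_dataset(..., llm_or_chain_factory=construct_chain)
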
async def arun_on_dataset(
client: Client,
client: Optional[Client],
dataset_name: str,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
*,
evaluation: Optional[smith_eval.RunEvalConfig] = None,
concurrency_level: int = 5,
project_name: Optional[str] = None,
project_metadata: Optional[Dict[str, Any]] = None,
verbose: bool = False,
tags: Optional[List[str]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
**kwargs: Any,
) -> Dict[str, Any]:
"""
Asynchronously run the Chain or language model on a dataset
and store traces to the specified project name.
Args:
client: LangSmith client to use to read the dataset, and to
log feedback and run traces.
dataset_name: Name of the dataset to run the chain on.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
evaluation: Optional evaluation configuration to use when evaluating
concurrency_level: The number of async tasks to run concurrently.
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
tags: Tags to add to each run in the project.
input_mapper: A function to map to the inputs dictionary from an Example
to the format expected by the model to be evaluated. This is useful if
your model needs to deserialize more complex schema or if your dataset
has inputs with keys that differ from what is expected by your chain
or agent.
Returns:
A dictionary containing the run's project name and the
resulting model outputs.
For the synchronous version, see :func:`run_on_dataset`.
Examples
--------
.. code-block:: python
from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
        from langchain import smith as smith_eval
        from langchain.smith import arun_on_dataset
# Chains may have memory. Passing in a constructor function lets the
# evaluation framework avoid cross-contamination between runs.
def construct_chain():
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(
llm,
"What's the answer to {your_input_key}"
)
return chain
# Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
evaluation_config = smith_eval.RunEvalConfig(
evaluators=[
"qa", # "Correctness" against a reference answer
"embedding_distance",
smith_eval.RunEvalConfig.Criteria("helpfulness"),
smith_eval.RunEvalConfig.Criteria({
"fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
}),
]
)
client = Client()
await arun_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config,
)
You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.
.. code-block:: python
from typing import Optional
from langchain.evaluation import StringEvaluator
class MyStringEvaluator(StringEvaluator):
@property
def requires_input(self) -> bool:
return False
@property
def requires_reference(self) -> bool:
return True
@property
def evaluation_name(self) -> str:
return "exact_match"
def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
return {"score": prediction == reference}
evaluation_config = smith_eval.RunEvalConfig(
custom_evaluators = [MyStringEvaluator()],
input_mapper = kwargs.pop("input_mapper", None)
if input_mapper:
warnings.warn(
_INPUT_MAPPER_DEP_WARNING,
DeprecationWarning,
)
await arun_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config,
)
""" # noqa: E501
if kwargs:
warnings.warn(
"The following arguments are deprecated and "
@@ -1085,6 +1007,7 @@ async def arun_on_dataset(
f"{kwargs.keys()}.",
DeprecationWarning,
)
client = client or Client()
wrapped_model, project_name, examples, configs = _prepare_run_on_dataset(
client,
dataset_name,
@@ -1094,6 +1017,7 @@ async def arun_on_dataset(
tags,
input_mapper,
concurrency_level,
project_metadata=project_metadata,
)
batch_results = await runnable_utils.gather_with_concurrency(
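
# A usage sketch of the async entry point after this change, assuming OPENAI_API_KEY
# and LANGCHAIN_API_KEY are set in the environment. The client argument may now be
# None, in which case a Client() is constructed internally; the dataset name,
# factory, and metadata keys are illustrative.
import asyncio

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.smith import RunEvalConfig, arun_on_dataset


def construct_chain():
    llm = ChatOpenAI(temperature=0)
    return LLMChain.from_string(llm, "What's the answer to {your_input_key}")


results = asyncio.run(
    arun_on_dataset(
        None,  # client; falls back to Client() built from the environment
        "<my_dataset_name>",
        construct_chain,
        evaluation=RunEvalConfig(evaluators=["qa"]),
        project_metadata={"prompt_version": "v2"},
    )
)
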
@@ -1120,126 +1044,24 @@ async def arun_on_dataset(
def run_on_dataset(
client: Client,
client: Optional[Client],
dataset_name: str,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
*,
evaluation: Optional[smith_eval.RunEvalConfig] = None,
concurrency_level: int = 5,
project_name: Optional[str] = None,
project_metadata: Optional[Dict[str, Any]] = None,
verbose: bool = False,
tags: Optional[List[str]] = None,
input_mapper: Optional[Callable[[Dict], Any]] = None,
**kwargs: Any,
) -> Dict[str, Any]:
"""
Run the Chain or language model on a dataset and store traces
to the specified project name.
Args:
client: LangSmith client to use to access the dataset and to
log feedback and run traces.
dataset_name: Name of the dataset to run the chain on.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
evaluation: Configuration for evaluators to run on the
results of the chain
concurrency_level: The number of async tasks to run concurrently.
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
tags: Tags to add to each run in the project.
input_mapper: A function to map to the inputs dictionary from an Example
to the format expected by the model to be evaluated. This is useful if
your model needs to deserialize more complex schema or if your dataset
has inputs with keys that differ from what is expected by your chain
or agent.
Returns:
A dictionary containing the run's project name and the resulting model outputs.
For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
Examples
--------
.. code-block:: python
from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
        from langchain import smith as smith_eval
        from langchain.smith import run_on_dataset
# Chains may have memory. Passing in a constructor function lets the
# evaluation framework avoid cross-contamination between runs.
def construct_chain():
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(
llm,
"What's the answer to {your_input_key}"
)
return chain
# Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
evaluation_config = smith_eval.RunEvalConfig(
evaluators=[
"qa", # "Correctness" against a reference answer
"embedding_distance",
smith_eval.RunEvalConfig.Criteria("helpfulness"),
smith_eval.RunEvalConfig.Criteria({
"fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
}),
]
)
client = Client()
run_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config,
)
You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.
.. code-block:: python
from typing import Optional
from langchain.evaluation import StringEvaluator
class MyStringEvaluator(StringEvaluator):
@property
def requires_input(self) -> bool:
return False
@property
def requires_reference(self) -> bool:
return True
@property
def evaluation_name(self) -> str:
return "exact_match"
def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
return {"score": prediction == reference}
evaluation_config = smith_eval.RunEvalConfig(
custom_evaluators = [MyStringEvaluator()],
)
run_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config,
input_mapper = kwargs.pop("input_mapper", None)
if input_mapper:
warnings.warn(
_INPUT_MAPPER_DEP_WARNING,
DeprecationWarning,
)
""" # noqa: E501
if kwargs:
warnings.warn(
"The following arguments are deprecated and "
@@ -1247,6 +1069,7 @@ def run_on_dataset(
f"{kwargs.keys()}.",
DeprecationWarning,
)
client = client or Client()
wrapped_model, project_name, examples, configs = _prepare_run_on_dataset(
client,
dataset_name,
@@ -1256,6 +1079,7 @@ def run_on_dataset(
tags,
input_mapper,
concurrency_level,
project_metadata=project_metadata,
)
if concurrency_level == 0:
batch_results = [
@@ -1290,3 +1114,114 @@ def run_on_dataset(
except Exception as e:
logger.debug(f"Failed to print aggregate feedback: {repr(e)}")
return results
_RUN_ON_DATASET_DOCSTRING = """
Run the Chain or language model on a dataset and store traces
to the specified project name.
Args:
dataset_name: Name of the dataset to run the chain on.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
evaluation: Configuration for evaluators to run on the
results of the chain
concurrency_level: The number of async tasks to run concurrently.
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
    project_metadata: Optional metadata to add to the project.
        Useful for storing information about the test variant
        (prompt version, model version, etc.).
client: LangSmith client to use to access the dataset and to
log feedback and run traces.
verbose: Whether to print progress.
tags: Tags to add to each run in the project.
Returns:
A dictionary containing the run's project name and the resulting model outputs.
For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
Examples
--------
.. code-block:: python
from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
    from langchain import smith as smith_eval
    from langchain.smith import run_on_dataset
# Chains may have memory. Passing in a constructor function lets the
# evaluation framework avoid cross-contamination between runs.
def construct_chain():
llm = ChatOpenAI(temperature=0)
chain = LLMChain.from_string(
llm,
"What's the answer to {your_input_key}"
)
return chain
# Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
evaluation_config = smith_eval.RunEvalConfig(
evaluators=[
"qa", # "Correctness" against a reference answer
"embedding_distance",
smith_eval.RunEvalConfig.Criteria("helpfulness"),
smith_eval.RunEvalConfig.Criteria({
"fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
}),
]
)
client = Client()
run_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config,
)
You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.
.. code-block:: python
from typing import Optional
from langchain.evaluation import StringEvaluator
class MyStringEvaluator(StringEvaluator):
@property
def requires_input(self) -> bool:
return False
@property
def requires_reference(self) -> bool:
return True
@property
def evaluation_name(self) -> str:
return "exact_match"
def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
return {"score": prediction == reference}
evaluation_config = smith_eval.RunEvalConfig(
custom_evaluators = [MyStringEvaluator()],
)
run_on_dataset(
client,
"<my_dataset_name>",
construct_chain,
evaluation=evaluation_config,
)
""" # noqa: E501
run_on_dataset.__doc__ = _RUN_ON_DATASET_DOCSTRING
arun_on_dataset.__doc__ = _RUN_ON_DATASET_DOCSTRING.replace(
"run_on_dataset(", "await arun_on_dataset("
)
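
# A usage sketch of the new project_metadata argument on the synchronous entry point,
# assuming the construct_chain factory from the docstring above and the usual
# OPENAI_API_KEY / LANGCHAIN_API_KEY environment setup. Metadata keys and the project
# name are illustrative; per the change to _prepare_eval_run, the dict is persisted
# under the project's extra["metadata"].
from langsmith import Client

from langchain.smith import RunEvalConfig, run_on_dataset

client = Client()
run_on_dataset(
    client,
    "<my_dataset_name>",
    construct_chain,
    evaluation=RunEvalConfig(evaluators=["qa"]),
    project_name="prompt-v2-baseline",
    project_metadata={"prompt_version": "v2", "model": "gpt-3.5-turbo"},
    # concurrency_level=0 would run the examples serially, e.g. for local debugging
)
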

@@ -20,9 +20,12 @@ def _check_all_feedback_passed(_project_name: str, client: Client) -> None:
# Assert that all runs completed, all feedback completed, and that the
# chain or llm passes for the feedback provided.
runs = list(client.list_runs(project_name=_project_name, execution_order=1))
assert len(runs) == 4
if not runs:
        # Queue delays; we are mainly just smoke testing for now.
return
feedback = list(client.list_feedback(run_ids=[run.id for run in runs]))
assert len(feedback) == 8
if not feedback:
return
assert all([f.score == 1 for f in feedback])
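
# A hedged alternative to the early returns above: poll briefly for queued runs before
# giving up, so the assertions still execute when ingestion is merely delayed. The
# helper name, expected count, and timings are illustrative, not part of the suite.
import time

from langsmith import Client


def _wait_for_runs(
    client: Client, project_name: str, expected: int, timeout: float = 30.0
) -> list:
    deadline = time.time() + timeout
    runs: list = []
    while time.time() < deadline:
        runs = list(client.list_runs(project_name=project_name, execution_order=1))
        if len(runs) >= expected:
            break
        time.sleep(2)
    return runs
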
@@ -80,7 +83,12 @@ def test_chat_model(
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(ValueError, match="Must specify reference_key"):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
run_on_dataset(
dataset_name=kv_dataset_name,
llm_or_chain_factory=llm,
evaluation=eval_config,
client=client,
)
eval_config = RunEvalConfig(
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
reference_key="some_output",
@@ -88,15 +96,20 @@ def test_chat_model(
with pytest.raises(
InputFormatError, match="Example inputs do not match language model"
):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
run_on_dataset(
dataset_name=kv_dataset_name,
llm_or_chain_factory=llm,
evaluation=eval_config,
client=client,
)
def input_mapper(d: dict) -> List[BaseMessage]:
return [HumanMessage(content=d["some_input"])]
run_on_dataset(
client,
kv_dataset_name,
llm,
client=client,
dataset_name=kv_dataset_name,
llm_or_chain_factory=llm,
evaluation=eval_config,
input_mapper=input_mapper,
project_name=eval_project_name,
@@ -109,7 +122,12 @@ def test_llm(kv_dataset_name: str, eval_project_name: str, client: Client) -> No
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(ValueError, match="Must specify reference_key"):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
run_on_dataset(
dataset_name=kv_dataset_name,
llm_or_chain_factory=llm,
evaluation=eval_config,
client=client,
)
eval_config = RunEvalConfig(
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
reference_key="some_output",
@@ -117,15 +135,20 @@ def test_llm(kv_dataset_name: str, eval_project_name: str, client: Client) -> No
with pytest.raises(
InputFormatError, match="Example inputs do not match language model"
):
run_on_dataset(client, kv_dataset_name, llm, evaluation=eval_config)
run_on_dataset(
dataset_name=kv_dataset_name,
llm_or_chain_factory=llm,
evaluation=eval_config,
client=client,
)
def input_mapper(d: dict) -> str:
return d["some_input"]
run_on_dataset(
client,
kv_dataset_name,
llm,
client=client,
dataset_name=kv_dataset_name,
llm_or_chain_factory=llm,
evaluation=eval_config,
input_mapper=input_mapper,
project_name=eval_project_name,
@@ -139,7 +162,12 @@ def test_chain(kv_dataset_name: str, eval_project_name: str, client: Client) ->
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
with pytest.raises(ValueError, match="Must specify reference_key"):
run_on_dataset(client, kv_dataset_name, lambda: chain, evaluation=eval_config)
run_on_dataset(
dataset_name=kv_dataset_name,
llm_or_chain_factory=lambda: chain,
evaluation=eval_config,
client=client,
)
eval_config = RunEvalConfig(
evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA],
reference_key="some_output",
@@ -147,7 +175,12 @@ def test_chain(kv_dataset_name: str, eval_project_name: str, client: Client) ->
with pytest.raises(
InputFormatError, match="Example inputs do not match chain input keys"
):
run_on_dataset(client, kv_dataset_name, lambda: chain, evaluation=eval_config)
run_on_dataset(
dataset_name=kv_dataset_name,
llm_or_chain_factory=lambda: chain,
evaluation=eval_config,
client=client,
)
def input_mapper(d: dict) -> dict:
return {"input": d["some_input"]}
@@ -157,22 +190,20 @@ def test_chain(kv_dataset_name: str, eval_project_name: str, client: Client) ->
match=" match the chain's expected input keys.",
):
run_on_dataset(
client,
kv_dataset_name,
lambda: chain,
dataset_name=kv_dataset_name,
llm_or_chain_factory=lambda: input_mapper | chain,
client=client,
evaluation=eval_config,
input_mapper=input_mapper,
)
def right_input_mapper(d: dict) -> dict:
return {"question": d["some_input"]}
run_on_dataset(
client,
kv_dataset_name,
lambda: chain,
dataset_name=kv_dataset_name,
llm_or_chain_factory=lambda: right_input_mapper | chain,
client=client,
evaluation=eval_config,
input_mapper=right_input_mapper,
project_name=eval_project_name,
tags=["shouldpass"],
)
@@ -230,10 +261,10 @@ def test_chat_model_on_chat_dataset(
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
chat_dataset_name,
llm,
dataset_name=chat_dataset_name,
llm_or_chain_factory=llm,
evaluation=eval_config,
client=client,
project_name=eval_project_name,
)
_check_all_feedback_passed(eval_project_name, client)
@@ -245,9 +276,9 @@ def test_llm_on_chat_dataset(
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
chat_dataset_name,
llm,
dataset_name=chat_dataset_name,
llm_or_chain_factory=llm,
client=client,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
@@ -263,9 +294,9 @@ def test_chain_on_chat_dataset(chat_dataset_name: str, client: Client) -> None:
ValueError, match="Cannot evaluate a chain on dataset with data_type=chat"
):
run_on_dataset(
client,
chat_dataset_name,
lambda: chain,
dataset_name=chat_dataset_name,
client=client,
llm_or_chain_factory=lambda: chain,
evaluation=eval_config,
)
@@ -308,9 +339,9 @@ def test_chat_model_on_llm_dataset(
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
llm_dataset_name,
llm,
client=client,
dataset_name=llm_dataset_name,
llm_or_chain_factory=llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
@@ -324,9 +355,9 @@ def test_llm_on_llm_dataset(
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
llm_dataset_name,
llm,
client=client,
dataset_name=llm_dataset_name,
llm_or_chain_factory=llm,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
@@ -342,9 +373,9 @@ def test_chain_on_llm_dataset(llm_dataset_name: str, client: Client) -> None:
ValueError, match="Cannot evaluate a chain on dataset with data_type=llm"
):
run_on_dataset(
client,
llm_dataset_name,
lambda: chain,
client=client,
dataset_name=llm_dataset_name,
llm_or_chain_factory=lambda: chain,
evaluation=eval_config,
)
@@ -386,10 +417,10 @@ def test_chat_model_on_kv_singleio_dataset(
llm = ChatOpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
kv_singleio_dataset_name,
llm,
dataset_name=kv_singleio_dataset_name,
llm_or_chain_factory=llm,
evaluation=eval_config,
client=client,
project_name=eval_project_name,
tags=["shouldpass"],
)
@@ -402,9 +433,9 @@ def test_llm_on_kv_singleio_dataset(
llm = OpenAI(temperature=0)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
kv_singleio_dataset_name,
llm,
dataset_name=kv_singleio_dataset_name,
llm_or_chain_factory=llm,
client=client,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
@@ -419,9 +450,9 @@ def test_chain_on_kv_singleio_dataset(
chain = LLMChain.from_string(llm, "The answer to the {question} is: ")
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
run_on_dataset(
client,
kv_singleio_dataset_name,
lambda: chain,
dataset_name=kv_singleio_dataset_name,
llm_or_chain_factory=lambda: chain,
client=client,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
@@ -439,9 +470,9 @@ async def test_runnable_on_kv_singleio_dataset(
)
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
await arun_on_dataset(
client,
kv_singleio_dataset_name,
runnable,
dataset_name=kv_singleio_dataset_name,
llm_or_chain_factory=runnable,
client=client,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
@@ -463,9 +494,9 @@ async def test_arb_func_on_kv_singleio_dataset(
eval_config = RunEvalConfig(evaluators=[EvaluatorType.QA, EvaluatorType.CRITERIA])
await arun_on_dataset(
client,
kv_singleio_dataset_name,
my_func,
dataset_name=kv_singleio_dataset_name,
llm_or_chain_factory=my_func,
client=client,
evaluation=eval_config,
project_name=eval_project_name,
tags=["shouldpass"],
