Add examples to docstrings (#7796)

Also:
- remove the dataset name from the autogenerated project name
- print out the project name and URL for viewing results
William FH committed c58d35765d (parent ed97af423c)

@@ -10,7 +10,7 @@
     "\n",
     "One automated way to predict the preferred configuration is to use a `PairwiseStringEvaluator` like the `PairwiseStringEvalChain`<a name=\"cite_ref-1\"></a>[<sup>[1]</sup>](#cite_note-1). This chain prompts an LLM to select which output is preferred, given a specific input.\n",
     "\n",
-    "For this evalution, we will need 3 things:\n",
+    "For this evaluation, we will need 3 things:\n",
     "1. An evaluator\n",
     "2. A dataset of inputs\n",
     "3. 2 (or more) LLMs, Chains, or Agents to compare\n",

@@ -19,9 +19,10 @@ from typing import (
     Tuple,
     Union,
 )
+from urllib.parse import urlparse, urlunparse

 from langsmith import Client, RunEvaluator
-from langsmith.schemas import DataType, Example, RunTypeEnum
+from langsmith.schemas import Dataset, DataType, Example, RunTypeEnum

 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.callbacks.manager import Callbacks
@@ -50,6 +51,19 @@ class InputFormatError(Exception):

 ## Shared Utilities

+def _get_eval_project_url(api_url: str, project_id: str) -> str:
+    """Get the project url from the api url."""
+    parsed = urlparse(api_url)
+    hostname = parsed.hostname or ""
+    if "api." in hostname:
+        hostname = hostname.replace("api.", "", 1)
+    if "localhost" in hostname:
+        # Remove the port
+        hostname = "localhost"
+    url = urlunparse(parsed._replace(netloc=hostname))
+    return f"{url}/projects/p/{project_id}?eval=true"
+
 def _wrap_in_chain_factory(
     llm_or_chain_factory: Union[Chain, MODEL_OR_CHAIN_FACTORY],
     dataset_name: str = "<my_dataset>",
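The new `_get_eval_project_url` helper only rewrites the API hostname into the web-app hostname; tracing the code above on two hypothetical inputs:

.. code-block:: python

    # "api." is stripped from the hostname and the project path is appended.
    assert _get_eval_project_url(
        "https://api.smith.langchain.com", "abc123"
    ) == "https://smith.langchain.com/projects/p/abc123?eval=true"

    # For local development, the port is dropped from "localhost".
    assert _get_eval_project_url(
        "http://localhost:1984", "abc123"
    ) == "http://localhost/projects/p/abc123?eval=true"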
@@ -206,7 +220,6 @@ def _get_messages(inputs: Dict[str, Any]) -> List[BaseMessage]:

 def _get_project_name(
     project_name: Optional[str],
     llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
-    dataset_name: Optional[str],
 ) -> str:
     """
     Get the project name.
@@ -214,7 +227,6 @@ def _get_project_name(
     Args:
         project_name: The project name if manually specified.
         llm_or_chain_factory: The Chain or language model constructor.
-        dataset_name: The dataset name.

     Returns:
         The project name.
@@ -226,8 +238,7 @@ def _get_project_name(
         model_name = llm_or_chain_factory.__class__.__name__
     else:
         model_name = llm_or_chain_factory().__class__.__name__
-    dataset_prefix = f"{dataset_name}-" if dataset_name else ""
-    return f"{dataset_prefix}{model_name}-{current_time}"
+    return f"{current_time}-{model_name}"

 ## Shared Validation Utilities
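The effect of this change on autogenerated project names, sketched with a made-up timestamp (the actual `current_time` format is defined elsewhere in the module):

.. code-block:: python

    model_name = "ChatOpenAI"
    current_time = "2023-07-17-15-04-05"  # format assumed for illustration

    # Before: dataset name prefixed and timestamp last, e.g.
    # "my-dataset-ChatOpenAI-2023-07-17-15-04-05"
    old_name = f"my-dataset-{model_name}-{current_time}"

    # After: timestamp first, dataset name dropped, e.g.
    # "2023-07-17-15-04-05-ChatOpenAI"
    new_name = f"{current_time}-{model_name}"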
@@ -801,7 +812,7 @@ async def _arun_on_examples(
         A dictionary mapping example ids to the model outputs.
     """
     llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
     run_evaluators, examples = _setup_evaluation(
         llm_or_chain_factory, examples, evaluation, data_type
     )
@@ -1033,7 +1044,7 @@ def _run_on_examples(
     """
     results: Dict[str, Any] = {}
     llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, None)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
     tracer = LangChainTracer(
         project_name=project_name, client=client, use_threading=False
     )
@@ -1068,6 +1079,31 @@ def _run_on_examples(

 ## Public API

+def _prepare_eval_run(
+    client: Client,
+    dataset_name: str,
+    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
+    project_name: Optional[str],
+) -> Tuple[MODEL_OR_CHAIN_FACTORY, str, Dataset, Iterator[Example]]:
+    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
+    project_name = _get_project_name(project_name, llm_or_chain_factory)
+    try:
+        project = client.create_project(project_name)
+    except ValueError as e:
+        if "already exists " not in str(e):
+            raise e
+        raise ValueError(
+            f"Project {project_name} already exists. Please use a different name."
+        )
+    project_url = _get_eval_project_url(client.api_url, project.id)
+    print(
+        f"View the evaluation results for project '{project_name}' at:\n{project_url}"
+    )
+    dataset = client.read_dataset(dataset_name=dataset_name)
+    examples = client.list_examples(dataset_id=str(dataset.id))
+    return llm_or_chain_factory, project_name, dataset, examples
+
 async def arun_on_dataset(
     client: Client,
     dataset_name: str,
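A sketch of the new fail-fast behavior (dataset and project names hypothetical; `construct_chain` is the factory from the docstring examples below): re-using a project name now raises before any examples run, rather than failing midway:

.. code-block:: python

    from langsmith import Client

    client = Client()
    _prepare_eval_run(client, "my-dataset", construct_chain, "my-eval-project")
    # Prints: View the evaluation results for project 'my-eval-project' at: <url>

    try:
        # A second run with the same explicit project name fails immediately.
        _prepare_eval_run(client, "my-dataset", construct_chain, "my-eval-project")
    except ValueError as err:
        print(err)  # Project my-eval-project already exists. Please use a different name.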
@@ -1110,11 +1146,90 @@ async def arun_on_dataset(
     Returns:
         A dictionary containing the run's project name and the
         resulting model outputs.
-    """
-    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
-    dataset = client.read_dataset(dataset_name=dataset_name)
-    examples = client.list_examples(dataset_id=str(dataset.id))
+
+    For the synchronous version, see :func:`run_on_dataset`.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain.chat_models import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, arun_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa",  # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        await arun_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators=[MyStringEvaluator()],
+        )
+
+        await arun_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+    """  # noqa: E501
+    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+        client, dataset_name, llm_or_chain_factory, project_name
+    )
     results = await _arun_on_examples(
         client,
         examples,
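The docstring shows a custom `StringEvaluator`; the `RunEvaluator` path it mentions is not illustrated. A rough sketch of that alternative, with the `EvaluationResult` import path assumed for the LangSmith SDK of the time:

.. code-block:: python

    from typing import Optional

    from langsmith import RunEvaluator
    from langsmith.evaluation import EvaluationResult
    from langsmith.schemas import Example, Run

    class ExactMatchRunEvaluator(RunEvaluator):
        """Grade a run by exact match against the example's reference output."""

        def evaluate_run(
            self, run: Run, example: Optional[Example] = None
        ) -> EvaluationResult:
            prediction = (run.outputs or {}).get("output")
            reference = (example.outputs or {}).get("output") if example else None
            return EvaluationResult(key="exact_match", score=prediction == reference)

    evaluation_config = RunEvalConfig(
        custom_evaluators=[ExactMatchRunEvaluator()],
    )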
@@ -1174,11 +1289,91 @@ def run_on_dataset(
     Returns:
         A dictionary containing the run's project name and the resulting model outputs.
-    """
-    llm_or_chain_factory = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
-    project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
-    dataset = client.read_dataset(dataset_name=dataset_name)
-    examples = client.list_examples(dataset_id=str(dataset.id))
+
+    For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain.chat_models import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, run_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa",  # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        run_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators=[MyStringEvaluator()],
+        )
+
+        run_on_dataset(
+            client,
+            "<my_dataset_name>",
+            construct_chain,
+            evaluation=evaluation_config,
+        )
+    """  # noqa: E501
+    llm_or_chain_factory, project_name, dataset, examples = _prepare_eval_run(
+        client, dataset_name, llm_or_chain_factory, project_name
+    )
     results = _run_on_examples(
         client,
         examples,

@@ -313,8 +313,10 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
             {"result": f"Result for example {example.id}"} for _ in range(n_repetitions)
         ]

-    def mock_create_project(*args: Any, **kwargs: Any) -> None:
-        pass
+    def mock_create_project(*args: Any, **kwargs: Any) -> Any:
+        proj = mock.MagicMock()
+        proj.id = "123"
+        return proj

     with mock.patch.object(
         Client, "read_dataset", new=mock_read_dataset