Update to RunOnDataset helper functions to accept evaluator callbacks (#6629)

Also improve docstrings and update the tracing datasets notebook to
focus on "debug, evaluate, monitor"
Zander Chase 2023-06-26 23:58:13 -07:00 committed by GitHub
parent 7ac9b22886
commit 6ca383ecf6
6 changed files with 1054 additions and 751 deletions
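
At a glance, the new `run_evaluators` argument lets `run_on_dataset` / `arun_on_dataset` attach `RunEvaluator`s that grade each run and log feedback while the runs are traced. A minimal sketch of the updated call, mirroring the notebook below (the dataset name, tools, and tag are illustrative, and a dataset with reference outputs is assumed to already exist):

```python
from langchain.agents import AgentType, initialize_agent, load_tools
from langchain.chat_models import ChatOpenAI
from langchain.client import run_on_dataset
from langchain.evaluation.run_evaluators import get_criteria_evaluator, get_qa_evaluator

eval_llm = ChatOpenAI(temperature=0)
evaluators = [
    get_qa_evaluator(eval_llm),                       # grades against the dataset's reference outputs
    get_criteria_evaluator(eval_llm, "helpfulness"),  # reference-free criteria check
]

llm = ChatOpenAI(temperature=0)
tools = load_tools(["llm-math"], llm=llm)

def agent_factory():
    # Chains can be stateful, so a fresh agent is constructed for each example.
    return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS)

results = run_on_dataset(
    dataset_name="calculator-example-dataset",  # illustrative; must already exist
    llm_or_chain_factory=agent_factory,
    run_evaluators=evaluators,  # new in this PR: feedback is logged as runs complete
    tags=["smoke-test"],        # illustrative tag
)
```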

View File

@ -0,0 +1,84 @@
"""A tracer that runs evaluators over completed runs."""
from concurrent.futures import Future, ThreadPoolExecutor, wait
from typing import Any, Optional, Sequence, Set, Union
from uuid import UUID
from langchainplus_sdk import LangChainPlusClient, RunEvaluator
from langchain.callbacks.tracers.base import BaseTracer
from langchain.callbacks.tracers.schemas import Run
class EvaluatorCallbackHandler(BaseTracer):
"""A tracer that runs a run evaluator whenever a run is persisted.
Parameters
----------
evaluators : Sequence[RunEvaluator]
The run evaluators to apply to all top level runs.
max_workers : int, optional
The maximum number of worker threads to use for running the evaluators.
If not specified, it will default to the number of evaluators.
client : LangChainPlusClient, optional
The LangChainPlusClient instance to use for evaluating the runs.
If not specified, a new instance will be created.
example_id : Union[UUID, str], optional
The example ID to be associated with the runs.
Attributes
----------
example_id : Union[UUID, None]
The example ID associated with the runs.
client : LangChainPlusClient
The LangChainPlusClient instance used for evaluating the runs.
evaluators : Sequence[RunEvaluator]
The sequence of run evaluators to be executed.
executor : ThreadPoolExecutor
The thread pool executor used for running the evaluators.
futures : Set[Future]
The set of futures representing the running evaluators.
"""
name = "evaluator_callback_handler"
def __init__(
self,
evaluators: Sequence[RunEvaluator],
max_workers: Optional[int] = None,
client: Optional[LangChainPlusClient] = None,
example_id: Optional[Union[UUID, str]] = None,
**kwargs: Any
) -> None:
super().__init__(**kwargs)
self.example_id = (
UUID(example_id) if isinstance(example_id, str) else example_id
)
self.client = client or LangChainPlusClient()
self.evaluators = evaluators
self.executor = ThreadPoolExecutor(
max_workers=max(max_workers or len(evaluators), 1)
)
self.futures: Set[Future] = set()
def _persist_run(self, run: Run) -> None:
"""Run the evaluator on the run.
Parameters
----------
run : Run
The run to be evaluated.
"""
run_ = run.copy()
run_.reference_example_id = self.example_id
for evaluator in self.evaluators:
self.futures.add(
self.executor.submit(self.client.evaluate_run, run_, evaluator)
)
def wait_for_futures(self) -> None:
"""Wait for all futures to complete."""
futures = list(self.futures)
wait(futures)
for future in futures:
self.futures.remove(future)
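
For context, a minimal sketch of wiring this handler in by hand (the dataset helpers below do this for you). It assumes LangChain+ credentials are configured in the environment; the model and criterion are illustrative:

```python
from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.run_evaluators import get_criteria_evaluator
from langchain.llms import OpenAI

# A reference-free criteria evaluator; any RunEvaluator works here.
handler = EvaluatorCallbackHandler(
    evaluators=[get_criteria_evaluator(ChatOpenAI(temperature=0), "conciseness")]
)

llm = OpenAI(temperature=0)
llm("What is 2 + 2?", callbacks=[handler])  # the evaluation is submitted when the run is persisted
handler.wait_for_futures()  # block until all pending evaluations have finished
```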

View File

@ -1,20 +1,52 @@
"""A tracer that collects all nested runs in a list."""
from typing import Any, List
from typing import Any, List, Optional, Union
from uuid import UUID
from langchain.callbacks.tracers.base import BaseTracer
from langchain.callbacks.tracers.schemas import Run
class RunCollectorCallbackHandler(BaseTracer):
"""A tracer that collects all nested runs in a list.
"""
A tracer that collects all nested runs in a list.
Useful for inspection and for evaluation."""
This tracer is useful for inspection and evaluation purposes.
Parameters
----------
example_id : Optional[Union[UUID, str]], default=None
The ID of the example being traced. It can be either a UUID or a string.
"""
name = "run-collector_callback_handler"
def __init__(self, **kwargs: Any) -> None:
def __init__(
self, example_id: Optional[Union[UUID, str]] = None, **kwargs: Any
) -> None:
"""
Initialize the RunCollectorCallbackHandler.
Parameters
----------
example_id : Optional[Union[UUID, str]], default=None
The ID of the example being traced. It can be either a UUID or a string.
"""
super().__init__(**kwargs)
self.example_id = (
UUID(example_id) if isinstance(example_id, str) else example_id
)
self.traced_runs: List[Run] = []
def _persist_run(self, run: Run) -> None:
self.traced_runs.append(run)
"""
Persist a run by adding it to the traced_runs list.
Parameters
----------
run : Run
The run to be persisted.
"""
run_ = run.copy()
run_.reference_example_id = self.example_id
self.traced_runs.append(run_)
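
A minimal usage sketch (the import path is an assumption, since the file name is not shown in this view; the prompt and model are illustrative):

```python
from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler  # assumed path
from langchain.llms import OpenAI

collector = RunCollectorCallbackHandler()
OpenAI(temperature=0)("What is 2 + 2?", callbacks=[collector])

# traced_runs holds the top-level Run, with child runs nested beneath it.
run = collector.traced_runs[0]
print(run.outputs)
```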

View File

@ -1,4 +1,5 @@
"""Utilities for running LLMs/Chains over datasets."""
"""Utilities for running language models or Chains over datasets."""
from __future__ import annotations
import asyncio
@ -13,15 +14,18 @@ from typing import (
Iterator,
List,
Optional,
Sequence,
Union,
)
from langchainplus_sdk import LangChainPlusClient
from langchainplus_sdk import LangChainPlusClient, RunEvaluator
from langchainplus_sdk.schemas import Example
from langchain.base_language import BaseLanguageModel
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import Callbacks
from langchain.callbacks.tracers.base import BaseTracer
from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler
from langchain.callbacks.tracers.langchain import LangChainTracer
from langchain.chains.base import Chain
from langchain.chat_models.base import BaseChatModel
@ -41,11 +45,21 @@ MODEL_OR_CHAIN_FACTORY = Union[Callable[[], Chain], BaseLanguageModel]
class InputFormatError(Exception):
"""Raised when input format is invalid."""
"""Raised when the input format is invalid."""
def _get_prompts(inputs: Dict[str, Any]) -> List[str]:
"""Get prompts from inputs."""
"""
Get prompts from inputs.
Args:
inputs: The input dictionary.
Returns:
A list of prompts.
Raises:
InputFormatError: If the input format is invalid.
"""
if not inputs:
raise InputFormatError("Inputs should not be empty.")
@ -83,7 +97,17 @@ def _get_prompts(inputs: Dict[str, Any]) -> List[str]:
def _get_messages(inputs: Dict[str, Any]) -> List[List[BaseMessage]]:
"""Get Chat Messages from inputs."""
"""
Get Chat Messages from inputs.
Args:
inputs: The input dictionary.
Returns:
A list of chat messages.
Raises:
InputFormatError: If the input format is invalid.
"""
if not inputs:
raise InputFormatError("Inputs should not be empty.")
@ -112,13 +136,25 @@ def _get_messages(inputs: Dict[str, Any]) -> List[List[BaseMessage]]:
async def _arun_llm(
llm: BaseLanguageModel,
inputs: Dict[str, Any],
langchain_tracer: Optional[LangChainTracer],
*,
tags: Optional[List[str]] = None,
callbacks: Callbacks = None,
) -> Union[LLMResult, ChatResult]:
callbacks: Optional[List[BaseCallbackHandler]] = (
[langchain_tracer] if langchain_tracer else None
)
"""
Asynchronously run the language model.
Args:
llm: The language model to run.
inputs: The input dictionary.
tags: Optional tags to add to the run.
callbacks: Optional callbacks to use during the run.
Returns:
The LLMResult or ChatResult.
Raises:
ValueError: If the LLM type is unsupported.
InputFormatError: If the input format is invalid.
"""
if isinstance(llm, BaseLLM):
try:
llm_prompts = _get_prompts(inputs)
@ -152,18 +188,32 @@ async def _arun_llm_or_chain(
example: Example,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
n_repetitions: int,
langchain_tracer: Optional[LangChainTracer],
*,
tags: Optional[List[str]] = None,
callbacks: Optional[List[BaseCallbackHandler]] = None,
) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
"""Run the chain asynchronously."""
if langchain_tracer is not None:
previous_example_id = langchain_tracer.example_id
langchain_tracer.example_id = example.id
callbacks: Optional[List[BaseCallbackHandler]] = [langchain_tracer]
"""
Asynchronously run the Chain or language model.
Args:
example: The example to run.
llm_or_chain_factory: The Chain or language model constructor to run.
n_repetitions: The number of times to run the model on each example.
tags: Optional tags to add to the run.
callbacks: Optional callbacks to use during the run.
Returns:
A list of outputs.
"""
if callbacks:
previous_example_ids = [
getattr(tracer, "example_id", None) for tracer in callbacks
]
for tracer in callbacks:
if hasattr(tracer, "example_id"):
tracer.example_id = example.id
else:
previous_example_id = None
callbacks = None
previous_example_ids = None
outputs = []
for _ in range(n_repetitions):
try:
@ -171,8 +221,8 @@ async def _arun_llm_or_chain(
output: Any = await _arun_llm(
llm_or_chain_factory,
example.inputs,
langchain_tracer,
tags=tags,
callbacks=callbacks,
)
else:
chain = llm_or_chain_factory()
@ -183,15 +233,19 @@ async def _arun_llm_or_chain(
except Exception as e:
logger.warning(f"Chain failed for example {example.id}. Error: {e}")
outputs.append({"Error": str(e)})
if langchain_tracer is not None:
langchain_tracer.example_id = previous_example_id
if callbacks and previous_example_ids:
for example_id, tracer in zip(previous_example_ids, callbacks):
if hasattr(tracer, "example_id"):
tracer.example_id = example_id
return outputs
async def _gather_with_concurrency(
n: int,
initializer: Callable[[], Coroutine[Any, Any, Optional[LangChainTracer]]],
*async_funcs: Callable[[Optional[LangChainTracer], Dict], Coroutine[Any, Any, Any]],
initializer: Callable[[], Coroutine[Any, Any, Any]],
*async_funcs: Callable[
[Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any]
],
) -> List[Any]:
"""
Run coroutines with a concurrency limit.
@ -207,37 +261,42 @@ async def _gather_with_concurrency(
semaphore = asyncio.Semaphore(n)
job_state = {"num_processed": 0}
tracer_queue: asyncio.Queue[Optional[LangChainTracer]] = asyncio.Queue()
callback_queue: asyncio.Queue[Sequence[BaseCallbackHandler]] = asyncio.Queue()
for _ in range(n):
tracer_queue.put_nowait(await initializer())
callback_queue.put_nowait(await initializer())
async def run_coroutine_with_semaphore(
async_func: Callable[
[Optional[LangChainTracer], Dict], Coroutine[Any, Any, Any]
[Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any]
]
) -> Any:
async with semaphore:
tracer = await tracer_queue.get()
callbacks = await callback_queue.get()
try:
result = await async_func(tracer, job_state)
result = await async_func(callbacks, job_state)
finally:
tracer_queue.put_nowait(tracer)
callback_queue.put_nowait(callbacks)
return result
results = await asyncio.gather(
*(run_coroutine_with_semaphore(function) for function in async_funcs)
)
while tracer_queue:
while callback_queue:
try:
tracer = tracer_queue.get_nowait()
callbacks = callback_queue.get_nowait()
except asyncio.QueueEmpty:
break
if tracer:
tracer.wait_for_futures()
for callback in callbacks:
if isinstance(callback, (LangChainTracer, EvaluatorCallbackHandler)):
callback.wait_for_futures()
return results
async def _tracer_initializer(project_name: Optional[str]) -> Optional[LangChainTracer]:
async def _callbacks_initializer(
project_name: Optional[str],
client: LangChainPlusClient,
run_evaluators: Sequence[RunEvaluator],
) -> List[BaseTracer]:
"""
Initialize a tracer to share across tasks.
@ -247,11 +306,19 @@ async def _tracer_initializer(project_name: Optional[str]) -> Optional[LangChain
Returns:
A LangChainTracer instance with an active project.
"""
callbacks: List[BaseTracer] = []
if project_name:
tracer = LangChainTracer(project_name=project_name)
return tracer
else:
return None
callbacks.append(LangChainTracer(project_name=project_name))
if run_evaluators:
callbacks.append(
EvaluatorCallbackHandler(
client=client,
evaluators=run_evaluators,
# We already have concurrency, don't want to overload the machine
max_workers=1,
)
)
return callbacks
async def arun_on_examples(
@ -262,13 +329,16 @@ async def arun_on_examples(
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
) -> Dict[str, Any]:
"""
Run the chain on examples and store traces to the specified project name.
Asynchronously run the chain on examples and store traces
to the specified project name.
Args:
examples: Examples to run the model or chain over
examples: Examples to run the model or chain over.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
@ -277,24 +347,35 @@ async def arun_on_examples(
This is useful when testing success rates or generating confidence
intervals.
project_name: Project name to use when tracing runs.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
tags: Tags to add to the traces.
client: Client to use to read the dataset. If not provided, a new
client will be created using the credentials in the environment.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
Returns:
A dictionary mapping example ids to the model outputs.
"""
project_name = _get_project_name(project_name, llm_or_chain_factory, None)
client_ = client or LangChainPlusClient()
client_.create_project(project_name, mode="eval")
results: Dict[str, List[Any]] = {}
evaluation_handler = EvaluatorCallbackHandler(
evaluators=run_evaluators or [], client=client_
)
async def process_example(
example: Example, tracer: Optional[LangChainTracer], job_state: dict
example: Example, callbacks: List[BaseCallbackHandler], job_state: dict
) -> None:
"""Process a single example."""
result = await _arun_llm_or_chain(
example,
llm_or_chain_factory,
num_repetitions,
tracer,
tags=tags,
callbacks=callbacks,
)
results[str(example.id)] = result
job_state["num_processed"] += 1
@ -307,9 +388,15 @@ async def arun_on_examples(
await _gather_with_concurrency(
concurrency_level,
functools.partial(_tracer_initializer, project_name),
functools.partial(
_callbacks_initializer,
project_name=project_name,
client=client_,
run_evaluators=run_evaluators or [],
),
*(functools.partial(process_example, e) for e in examples),
)
evaluation_handler.wait_for_futures()
return results
@ -320,7 +407,21 @@ def run_llm(
*,
tags: Optional[List[str]] = None,
) -> Union[LLMResult, ChatResult]:
"""Run the language model on the example."""
"""
Run the language model on the example.
Args:
llm: The language model to run.
inputs: The input dictionary.
callbacks: The callbacks to use during the run.
tags: Optional tags to add to the run.
Returns:
The LLMResult or ChatResult.
Raises:
ValueError: If the LLM type is unsupported.
InputFormatError: If the input format is invalid.
"""
if isinstance(llm, BaseLLM):
try:
llm_prompts = _get_prompts(inputs)
@ -350,18 +451,32 @@ def run_llm_or_chain(
example: Example,
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
n_repetitions: int,
langchain_tracer: Optional[LangChainTracer] = None,
*,
tags: Optional[List[str]] = None,
callbacks: Optional[List[BaseCallbackHandler]] = None,
) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
"""Run the chain synchronously."""
if langchain_tracer is not None:
previous_example_id = langchain_tracer.example_id
langchain_tracer.example_id = example.id
callbacks: Optional[List[BaseCallbackHandler]] = [langchain_tracer]
"""
Run the Chain or language model synchronously.
Args:
example: The example to run.
llm_or_chain_factory: The Chain or language model constructor to run.
n_repetitions: The number of times to run the model on each example.
tags: Optional tags to add to the run.
callbacks: Optional callbacks to use during the run.
Returns:
A list of outputs.
"""
if callbacks:
previous_example_ids = [
getattr(tracer, "example_id", None) for tracer in callbacks
]
for tracer in callbacks:
if hasattr(tracer, "example_id"):
tracer.example_id = example.id
else:
previous_example_id = None
callbacks = None
previous_example_ids = None
outputs = []
for _ in range(n_repetitions):
try:
@ -376,8 +491,10 @@ def run_llm_or_chain(
except Exception as e:
logger.warning(f"Chain failed for example {example.id}. Error: {e}")
outputs.append({"Error": str(e)})
if langchain_tracer is not None:
langchain_tracer.example_id = previous_example_id
if callbacks and previous_example_ids:
for example_id, tracer in zip(previous_example_ids, callbacks):
if hasattr(tracer, "example_id"):
tracer.example_id = example_id
return outputs
@ -388,48 +505,74 @@ def run_on_examples(
num_repetitions: int = 1,
project_name: Optional[str] = None,
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
) -> Dict[str, Any]:
"""Run the chain on examples and store traces to the specified project name.
"""
Run the Chain or language model on examples and store
traces to the specified project name.
Args:
examples: Examples to run model or chain over.
examples: Examples to run the model or chain over.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
independent calls on each example without carrying over state.
concurrency_level: Number of async workers to run in parallel.
num_repetitions: Number of times to run the model on each example.
This is useful when testing success rates or generating confidence
intervals.
project_name: Project name to use when tracing runs.
project_name: Name of the project to store the traces in.
Defaults to {dataset_name}-{chain class name}-{datetime}.
verbose: Whether to print progress.
tags: Tags to add to the run traces.
client: Client to use to access the dataset. If None, a new client
will be created using the credentials in the environment.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
Returns:
A dictionary mapping example ids to the model outputs.
"""
results: Dict[str, Any] = {}
tracer = LangChainTracer(project_name=project_name) if project_name else None
project_name = _get_project_name(project_name, llm_or_chain_factory, None)
client_ = client or LangChainPlusClient()
client_.create_project(project_name, mode="eval")
tracer = LangChainTracer(project_name=project_name)
evaluation_handler = EvaluatorCallbackHandler(
evaluators=run_evaluators or [], client=client_
)
callbacks: List[BaseCallbackHandler] = [tracer, evaluation_handler]
for i, example in enumerate(examples):
result = run_llm_or_chain(
example,
llm_or_chain_factory,
num_repetitions,
langchain_tracer=tracer,
tags=tags,
callbacks=callbacks,
)
if verbose:
print(f"{i+1} processed", flush=True, end="\r")
results[str(example.id)] = result
if tracer:
tracer.wait_for_futures()
evaluation_handler.wait_for_futures()
return results
def _get_project_name(
project_name: Optional[str],
llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
dataset_name: str,
dataset_name: Optional[str],
) -> str:
"""
Get the project name.
Args:
project_name: The project name if manually specified.
llm_or_chain_factory: The Chain or language model constructor.
dataset_name: The dataset name.
Returns:
The project name.
"""
if project_name is not None:
return project_name
current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
@ -437,7 +580,8 @@ def _get_project_name(
model_name = llm_or_chain_factory.__class__.__name__
else:
model_name = llm_or_chain_factory().__class__.__name__
return f"{dataset_name}-{model_name}-{current_time}"
dataset_prefix = f"{dataset_name}-" if dataset_name else ""
return f"{dataset_prefix}{model_name}-{current_time}"
async def arun_on_dataset(
@ -450,12 +594,13 @@ async def arun_on_dataset(
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
) -> Dict[str, Any]:
"""
Run the chain on a dataset and store traces to the specified project name.
Asynchronously run the Chain or language model on a dataset
and store traces to the specified project name.
Args:
client: Client to use to read the dataset.
dataset_name: Name of the dataset to run the chain on.
llm_or_chain_factory: Language model or Chain constructor to run
over the dataset. The Chain constructor is used to permit
@ -469,7 +614,8 @@ async def arun_on_dataset(
verbose: Whether to print progress.
client: Client to use to read the dataset. If not provided, a new
client will be created using the credentials in the environment.
tags: Tags to add to each run in the sesssion.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
Returns:
A dictionary containing the run's project name and the resulting model outputs.
@ -478,7 +624,6 @@ async def arun_on_dataset(
project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name)
dataset = client_.read_dataset(dataset_name=dataset_name)
examples = client_.list_examples(dataset_id=str(dataset.id))
results = await arun_on_examples(
examples,
llm_or_chain_factory,
@ -486,7 +631,9 @@ async def arun_on_dataset(
num_repetitions=num_repetitions,
project_name=project_name,
verbose=verbose,
client=client_,
tags=tags,
run_evaluators=run_evaluators,
)
return {
"project_name": project_name,
@ -503,8 +650,11 @@ def run_on_dataset(
verbose: bool = False,
client: Optional[LangChainPlusClient] = None,
tags: Optional[List[str]] = None,
run_evaluators: Optional[Sequence[RunEvaluator]] = None,
) -> Dict[str, Any]:
"""Run the chain on a dataset and store traces to the specified project name.
"""
Run the Chain or language model on a dataset and store traces
to the specified project name.
Args:
dataset_name: Name of the dataset to run the chain on.
@ -520,7 +670,8 @@ def run_on_dataset(
verbose: Whether to print progress.
client: Client to use to access the dataset. If None, a new client
will be created using the credentials in the environment.
tags: Tags to add to each run in the sesssion.
tags: Tags to add to each run in the project.
run_evaluators: Evaluators to run on the results of the chain.
Returns:
A dictionary containing the run's project name and the resulting model outputs.
@ -536,6 +687,8 @@ def run_on_dataset(
project_name=project_name,
verbose=verbose,
tags=tags,
run_evaluators=run_evaluators,
client=client_,
)
return {
"project_name": project_name,

View File

@ -117,10 +117,12 @@ def get_qa_evaluator(
choices_map={"CORRECT": 1, "INCORRECT": 0},
),
)
tags = kwargs.pop("tags", [])
return RunEvaluatorChain(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=output_parser,
tags=tags + [evaluation_name],
**kwargs,
)
@ -174,6 +176,7 @@ def get_criteria_evaluator(
choices_map={"Y": 1, "N": 0}, evaluation_name=evaluation_name
),
)
tags = kwargs.pop("tags", [])
eval_chain = CriteriaEvalChain.from_llm(
llm=llm, criteria=criteria_, prompt=prompt, **kwargs
)
@ -181,6 +184,7 @@ def get_criteria_evaluator(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=parser,
tags=tags + [evaluation_name],
**kwargs,
)
@ -303,9 +307,11 @@ def get_trajectory_evaluator(
TrajectoryEvalOutputParser(evaluation_name=evaluation_name),
)
eval_chain = LLMChain(llm=llm, prompt=prompt, **kwargs)
tags = kwargs.pop("tags", [])
return RunEvaluatorChain(
eval_chain=eval_chain,
input_mapper=input_mapper,
output_parser=parser,
tags=tags + [evaluation_name],
**kwargs,
)
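
The change above merges caller-supplied `tags` with the evaluator's evaluation name, so feedback runs are easier to filter in the UI. A small sketch of how the keyword might be used (the tag value is illustrative):

```python
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.run_evaluators import get_qa_evaluator

eval_llm = ChatOpenAI(temperature=0)
# The resulting RunEvaluatorChain carries both "nightly-eval" and its
# evaluation name as tags, and can be passed via run_evaluators as shown above.
qa_evaluator = get_qa_evaluator(eval_llm, tags=["nightly-eval"])
```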

View File

@ -1,80 +1,65 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "1a4596ea-a631-416d-a2a4-3577c140493d",
"metadata": {
"tags": []
},
"source": [
"# Tracing and Datasets with LangChainPlus\n",
"# Debug, Evaluate, and Monitor LLMs with LangSmith\n",
"\n",
"LangChain makes it easy to get started with Agents and other LLM applications. However, it can be tricky to get right, especially when you need to deliver a full product. To speed up your application development process, and to help monitor your applications in production, LangChain offers additional tracing and tooling.\n",
"LangChain makes it easy to get started with Agents and other LLM applications. Even so, delivering a high-quality agent to production can be deceptively difficult. To aid the development process, we've designed tracing and callbacks at the core of LangChain. In this notebook, you will get started prototyping, testing, and monitoring an LLM agent.\n",
"\n",
"When might you want to use tracing? Some situations we've found it useful include:\n",
"- Quickly debugging a new chain, agent, or set of tools\n",
"- Evaluating a given chain across different LLMs or Chat Models to compare results or improve prompts\n",
"- Running a given chain multiple time on a dataset to ensure it consistently meets a quality bar.\n",
"- Capturing production traces and using LangChain summarizers to analyze app usage"
]
},
{
"cell_type": "markdown",
"id": "138fbb8f-960d-4d26-9dd5-6d6acab3ee55",
"metadata": {},
"source": [
"## Prerequisites\n",
"\n",
"**Either [create a hosted LangSmith account](https://www.langchain.plus/) and connect with an API key OR\n",
"run the server locally.**\n",
"\n",
"\n",
"In this notebook, we'll show how to enable tracing in your LangChain applications and walk you a couple common ways to evaluate your agents.\n",
"We'll focus on using Datasets to benchmark Chain behavior.\n",
"\n",
"**Bear in mind that this notebook is designed under the assumption that you're running the latest LangChain+ server locally in the background. This is done using the folowing command in your terminal:**\n",
"\n",
"\n",
"To run the local server, execute the following comand in your terminal:\n",
"```\n",
"pip install --upgrade langchain\n",
"langchain plus start\n",
"```\n",
"\n",
"We also have a hosted version which is in private beta. We will share more details as it progresses.\n",
"\n",
"Now, let's get started by creating a client to connect to LangChain+."
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "2d77d064-41b4-41fb-82e6-2d16461269ec",
"metadata": {
"tags": []
},
"source": [
"## Setting up Tracing\n",
"## Debug your Agent\n",
"\n",
"The V2 tracing API can be activated by setting the `LANGCHAIN_TRACING_V2` environment variable to true. Assuming you've successfully initiated the server as described earlier, running LangChain Agents, Chains, LLMs, and other primitives will automatically start capturing traces. Let's begin our exploration with a straightforward math example.\n",
"First, configure your environment variables to tell LangChain to log traces. This is done by setting the `LANGCHAIN_TRACING_V2` environment variable to true.\n",
"You can tell LangChain which project to log to by setting the `LANGCHAIN_PROJECT` environment variable. This will automatically create a debug project for you.\n",
"\n",
"For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.langchain.plus/docs/)\n",
"\n",
"**NOTE:** You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n",
"\n",
"**NOTE**: You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "7935e832-9ae1-4557-8d08-890c425f18e2",
"metadata": {},
"source": [
"**NOTE:** You can also use the `tracing_v2_enabled` context manager to capture projects within a given context:\n",
"```\n",
"from langchain.callbacks.manager import tracing_v2_enabled\n",
"with tracing_v2_enabled(\"My Project Name\"):\n",
" ...\n",
"```"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "221b638a-2ae4-46ef-bf6a-d59bf85d587f",
"metadata": {},
"source": [
"**NOTE:** You can optionally set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables if using the hosted version which is in private beta."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"id": "904db9a5-f387-4a57-914c-c8af8d39e249",
"metadata": {
"tags": []
@ -96,28 +81,42 @@
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
]
},
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"from uuid import uuid4\n",
"from langchainplus_sdk import LangChainPlusClient\n",
"\n",
"unique_id = uuid4().hex[0:8]\n",
"os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
"os.environ[\"LANGCHAIN_PROJECT\"] = \"Tracing Walkthrough\"\n",
"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\" # Uncomment this line if you want to use the hosted version\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"<YOUR-LANGCHAINPLUS-API-KEY>\" # Uncomment this line if you want to use the hosted version.\n",
"os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n",
"# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\" # Uncomment this line to use the hosted version\n",
"# os.environ[\"LANGCHAIN_API_KEY\"] = \"<YOUR-LANGCHAINPLUS-API-KEY>\" # Uncomment this line to use the hosted version.\n",
"\n",
"# Used by the agent below\n",
"# os.environ[\"OPENAI_API_KEY\"] = \"<YOUR-OPENAI-API-KEY>\"\n",
"# os.environ[\"SERPAPI_API_KEY\"] = \"<YOUR-SERPAPI-API-KEY>\"\n",
"\n",
"client = LangChainPlusClient()\n",
"print(\"You can click the link below to view the UI\")\n",
"client"
]
},
{
"cell_type": "markdown",
"id": "ca27fa11-ddce-4af0-971e-c5c37d5b92ef",
"metadata": {},
"source": [
"Now, start prototyping your agent. We will use a straightforward math example."
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"id": "7c801853-8e96-404d-984c-51ace59cbbef",
"metadata": {
"tags": []
@ -137,7 +136,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "19537902-b95c-4390-80a4-f6c9a937081e",
"metadata": {
"tags": []
@ -175,19 +174,86 @@
]
},
{
"attachments": {},
"cell_type": "code",
"execution_count": 5,
"id": "0405ff30-21fe-413d-85cf-9fa3c649efec",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.callbacks.tracers.langchain import wait_for_all_tracers\n",
"\n",
"# Logs are submitted in a background thread. Make sure they've been submitted before moving on.\n",
"wait_for_all_tracers()"
]
},
{
"cell_type": "markdown",
"id": "6c43c311-4e09-4d57-9ef3-13afb96ff430",
"id": "9decb964-be07-4b6c-9802-9825c8be7b64",
"metadata": {},
"source": [
"## Creating the Dataset\n",
"\n",
"Now that you've captured a project entitled 'Tracing Walkthrough', it's time to create a dataset. We will do so using the `create_dataset` method below."
"Assuming you've successfully initiated the server as described earlier, your agent logs should show up in your server. You can check by clicking on the link below:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"id": "b7bc3934-bb1a-452c-a723-f9cdb0b416f9",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
],
"text/plain": [
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client"
]
},
{
"cell_type": "markdown",
"id": "6c43c311-4e09-4d57-9ef3-13afb96ff430",
"metadata": {},
"source": [
"## Test\n",
"\n",
"Once you've debugged a prototype of your agent, you will want to create tests and benchmark evaluations as you think about putting it into a production environment.\n",
"\n",
"In this notebook, you will run evaluators to test an agent. You will do so in a few steps:\n",
"\n",
"1. Create a dataset\n",
"2. Select or create evaluators to measure performance\n",
"3. Define the LLM or Chain initializer to test\n",
"4. Run the chain and evaluators using the helper functions"
]
},
{
"cell_type": "markdown",
"id": "beab1a29-b79d-4a99-b5b1-0870c2d772b1",
"metadata": {},
"source": [
"### 1. Create Dataset\n",
"\n",
"Below, use the client to create a dataset from the Agent runs you just logged while debugging above. You will use these later to measure performance.\n",
"\n",
"For more information on datasets, including how to create them from CSVs or other files or how to create them in the web app, please refer to the [LangSmith documentation](https://docs.langchain.plus/docs)."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d14a9881-2a01-404c-8c56-0b78565c3ff4",
"metadata": {
"tags": []
@ -199,7 +265,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 8,
"id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d",
"metadata": {
"tags": []
@ -211,124 +277,109 @@
"dataset = client.create_dataset(\n",
" dataset_name, description=\"A calculator example dataset\"\n",
")\n",
"\n",
"runs = client.list_runs(\n",
" project_name=os.environ[\"LANGCHAIN_PROJECT\"],\n",
" execution_order=1, # Only return the top-level runs\n",
" error=False, # Only runs that succeed\n",
")\n",
"for run in runs:\n",
" if run.outputs is None:\n",
" continue\n",
" try:\n",
" client.create_example(\n",
" inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id\n",
" )\n",
" except:\n",
" pass"
" )"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "db79dea2-fbaa-4c12-9083-f6154b51e2d3",
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"id": "92e8944f-e6fc-4bdf-9611-b2db39698cbe",
"metadata": {},
"source": [
"**Alternative: Creating a Dataset in the UI** \n",
"### 2. Select RunEvaluators\n",
"\n",
"Alternatively, you could create or edit the dataset in the UI using the following steps:\n",
"Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n",
"It's easier to leverage AI-assisted feedback to evaluate your agent's performance.\n",
"\n",
" 1. Navigate to the UI by clicking on the link below.\n",
" 2. Select the 'search_and_math_chain' project from the list.\n",
" 3. Next to the fist example, click \"+ to Dataset\".\n",
" 4. Click \"Create Dataset\" and create a title **\"calculator-example-dataset\"**.\n",
" 5. Add the other examples to the dataset as well\n",
"Below, we will create some pre-implemented run evaluators that do the following:\n",
"- Compare results against ground truth labels. (You used the debug outputs above for this)\n",
"- Evaluate the overall agent trajectory based on the tool usage and intermediate steps.\n",
"- Evaluating 'aspects' of the agent's response in a reference-free manner using custom criteria\n",
"- Evaluating performance based on 'context' such as retrieved documents or tool results.\n",
"\n",
"Once you've used LangChain+ for a while, you will have a number of datasets to work with. To view all saved datasets, execute the following code:\n",
"For a longer discussion of how to select an appropriate evaluator for your use case and how to create your own\n",
"custom evaluators, please refer to the [LangSmith documentation](https://docs.langchain.plus/docs/).\n",
"\n",
"```\n",
"datasets = client.list_datasets()\n",
"print(datasets)\n",
"```\n",
"Below, create the run evaluators.\n",
"\n",
"\n",
"**Optional:** If you didn't run the trace above, you can also create datasets by uploading dataframes or CSV files."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1baa677c-5642-4378-8e01-3aa1647f19d6",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# !pip install datasets > /dev/null\n",
"# !pip install pandas > /dev/null"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "60d14593-c61f-449f-a38f-772ca43707c2",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# import pandas as pd\n",
"# from langchain.evaluation.loading import load_dataset\n",
"\n",
"# dataset = load_dataset(\"agent-search-calculator\")\n",
"# df = pd.DataFrame(dataset, columns=[\"question\", \"answer\"])\n",
"# df.columns = [\"input\", \"output\"] # The chain we want to evaluate below expects inputs with the \"input\" key\n",
"# df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "52a7ea76-79ca-4765-abf7-231e884040d6",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# dataset_name = \"calculator-example-dataset\"\n",
"\n",
"# if dataset_name not in set([dataset.name for dataset in client.list_datasets()]):\n",
"# dataset = client.upload_dataframe(df,\n",
"# name=dataset_name,\n",
"# description=\"A calculator example dataset\",\n",
"# input_keys=[\"input\"],\n",
"# output_keys=[\"output\"],\n",
"# )"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "07885b10",
"metadata": {
"tags": []
},
"source": [
"## Running a Chain on a Traced Dataset\n",
"\n",
"Once you have a dataset, you can run a compatible chain or other object over it to see its results. The run traces will automatically be associated with the dataset for easy attribution and analysis.\n",
"\n",
"**First, we'll define the chain we wish to run over the dataset.**\n",
"\n",
"In this case, we're using an agent, but it can be any simple chain."
"**Note: the feedback API is currently experimental and subject to change.**"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c2b59104-b90e-466a-b7ea-c5bd0194263b",
"id": "56298faa-9ff2-43a2-b35a-ee306e3bf64d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation.run_evaluators import (\n",
" get_qa_evaluator,\n",
" get_criteria_evaluator,\n",
" get_trajectory_evaluator,\n",
")\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"# You can use any model, but stronger llms tend to be more reliable\n",
"eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
"\n",
"# Measures accuracy against ground truth\n",
"qa_evaluator = get_qa_evaluator(eval_llm) \n",
"\n",
"# Measures how effective and efficient the agent's actions are\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
"trajectory_evaluator = get_trajectory_evaluator(eval_llm, agent_tools=tools)\n",
"\n",
"# Measure helpfulness. We have some pre-defined criteria you can select\n",
"helpfulness_evaluator = get_criteria_evaluator(\n",
" eval_llm,\n",
" \"helpfulness\",\n",
")\n",
"\n",
"# Custom criteria are specified as a dictionary\n",
"custom_criteria_evaluator = get_criteria_evaluator(\n",
" eval_llm,\n",
" {\n",
" \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n",
" },\n",
")\n",
"\n",
"evaluators = [\n",
" qa_evaluator,\n",
" trajectory_evaluator,\n",
" helpfulness_evaluator,\n",
" custom_criteria_evaluator,\n",
"]"
]
},
{
"cell_type": "markdown",
"id": "8adfd29c-b258-49e5-94b4-74597a12ba16",
"metadata": {
"tags": []
},
"source": [
"### 3. Define the Agent or LLM to Test\n",
"\n",
"You can evaluate any LLM or chain. Since chains can have memory, we need to pass an\n",
"initializer function that returns a new chain for each row.\n",
"\n",
"In this case, you will test an agent that uses OpenAI's function calling endpoints, but it can be any simple chain."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f42d8ecc-d46a-448b-a89c-04b0f6907f75",
"metadata": {
"tags": []
},
@ -338,28 +389,40 @@
"from langchain.agents import initialize_agent, load_tools\n",
"from langchain.agents import AgentType\n",
"\n",
"llm = ChatOpenAI(temperature=0)\n",
"llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
"tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
"agent = initialize_agent(\n",
" tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n",
")"
"\n",
"# Since chains can be stateful (e.g. they can have memory), we need provide\n",
"# a way to initialize a new chain for each row in the dataset. This is done\n",
"# by passing in a factory function that returns a new chain for each row.\n",
"def agent_factory():\n",
" return initialize_agent(\n",
" tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n",
")\n",
"\n",
"# If your chain is NOT stateful, your factory can return the object directly\n",
"# to improve runtime performance. For example:\n",
"# chain_factory = lambda: agent"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "84094a4a-1d76-461c-bc37-8c537939b466",
"metadata": {},
"id": "07885b10",
"metadata": {
"tags": []
},
"source": [
"**Now we're ready to run the chain!**\n",
"### 4. Run the Agent and Evaluators\n",
"\n",
"The docstring below hints ways you can configure the method to run."
"With the dataset, agent, and evaluators selected, you can use the helper function below to run them all.\n",
"\n",
"The run traces and evaluation feedback will automatically be associated with the dataset for easy attribution and analysis."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "112d7bdf-7e50-4c1a-9285-5bac8473f2ee",
"execution_count": 11,
"id": "3733269b-8085-4644-9d5d-baedcff13a2f",
"metadata": {
"tags": []
},
@ -378,12 +441,13 @@
"\u001b[0;34m\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[LangChainPlusClient]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[List[str]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mrun_evaluators\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[Sequence[RunEvaluator]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'Dict[str, Any]'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m\n",
"Run the chain on a dataset and store traces to the specified project name.\n",
"Asynchronously run the Chain or language model on a dataset\n",
"and store traces to the specified project name.\n",
"\n",
"Args:\n",
" client: Client to use to read the dataset.\n",
" dataset_name: Name of the dataset to run the chain on.\n",
" llm_or_chain_factory: Language model or Chain constructor to run\n",
" over the dataset. The Chain constructor is used to permit\n",
@ -397,7 +461,8 @@
" verbose: Whether to print progress.\n",
" client: Client to use to read the dataset. If not provided, a new\n",
" client will be created using the credentials in the environment.\n",
" tags: Tags to add to each run in the sesssion.\n",
" tags: Tags to add to each run in the project.\n",
" run_evaluators: Evaluators to run on the results of the chain.\n",
"\n",
"Returns:\n",
" A dictionary containing the run's project name and the resulting model outputs.\n",
@ -410,57 +475,22 @@
}
],
"source": [
"from langchain.client import arun_on_dataset\n",
"from langchain.client import (\n",
" arun_on_dataset,\n",
" run_on_dataset, # Available if your chain doesn't support async calls.\n",
")\n",
"\n",
"?arun_on_dataset"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "6e10f823",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Since chains can be stateful (e.g. they can have memory), we need provide\n",
"# a way to initialize a new chain for each row in the dataset. This is done\n",
"# by passing in a factory function that returns a new chain for each row.\n",
"chain_factory = lambda: initialize_agent(\n",
" tools,\n",
" llm,\n",
" agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n",
" verbose=False,\n",
")\n",
"\n",
"# If your chain is NOT stateful, your lambda can return the object directly\n",
"# to improve runtime performance. For example:\n",
"# chain_factory = lambda: agent"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed examples: 1\r"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Chain failed for example b36a82d3-4fb6-4bc4-87df-b7c355742b8e. Error: unknown format from LLM: Sorry, I cannot answer this question as it requires information that is not currently available.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
@ -472,14 +502,14 @@
"source": [
"chain_results = await arun_on_dataset(\n",
" dataset_name=dataset_name,\n",
" llm_or_chain_factory=chain_factory,\n",
" llm_or_chain_factory=agent_factory,\n",
" concurrency_level=5, # Optional, sets the number of examples to run at a time\n",
" verbose=True,\n",
" client=client,\n",
" tags=[\n",
" \"testing-notebook\",\n",
" \"turbo\",\n",
" ], # Optional, adds a tag to the resulting chain runs\n",
" run_evaluators=evaluators,\n",
")\n",
"\n",
"# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n",
@ -487,22 +517,23 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4",
"metadata": {
"tags": []
},
"source": [
"### Reviewing the Chain Results\n",
"### Review the Test Results\n",
"\n",
"You can review the results of the run in the tracing UI below and navigating to the project \n",
"with the title **\"Search + Calculator Agent Evaluation\"**"
"You can review the test results tracing UI below by navigating to the Testing project \n",
"with the title that starts with **\"calculator-example-dataset-AgentExecutor-\"**\n",
"\n",
"This will show the new runs and the feedback logged from the selected evaluators."
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"id": "136db492-d6ca-4215-96f9-439c23538241",
"metadata": {
"tags": []
@ -517,7 +548,7 @@
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
]
},
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@ -528,126 +559,123 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "63ed6561-6574-43b3-a653-fe410aa8a617",
"id": "5f2c0539-09c1-42f9-a2ee-6a88a378d479",
"metadata": {
"tags": []
},
"source": [
"For a real production application, you will want to add many more test cases and\n",
"incorporate larger datasets to run benchmark evaluations to measure aggregate performance\n",
"across. For more information on recommended ways to do this, see [LangSmith Documentation](https://docs.langchain.plus/docs/)"
]
},
{
"cell_type": "markdown",
"id": "cd67201c-8dc1-4689-981c-759800749e25",
"metadata": {},
"source": [
"## Running an Evaluation Chain\n",
"## Monitor\n",
"\n",
"Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n",
"It's easier to leverage AI-assisted feedback to evaluate your agent's performance.\n",
"Once your agent passed the selected quality bar, you can deploy it to production. For this notebook, you will simulate user interactions directly while logging your traces to LangSmith for monitoring.\n",
"\n",
"A few ways of doing this include:\n",
"- Adding ground-truth answers as outputs to the dataset and evaluating relative to those references.\n",
"- Evaluating the overall agent trajectory based on the tool usage and intermediate steps.\n",
"- Evaluating performance based on 'context' such as retrieved documents or tool results.\n",
"- Evaluating 'aspects' of the agent's response in a reference-free manner using targeted agent prompts.\n",
" \n",
"Below, we show how to run an evaluation chain that compares the model output with the ground-truth answers.\n",
"For more information on real production deployments, check out the [LangChain documentation](https://python.langchain.com/docs/guides/deployments/) or contact us at [support@langchain.dev](mailto:support@langchain.dev).\n",
"\n",
"**Note: the feedback API is currently experimental and subject to change.**"
"**First, create a new project to use in your production deployment.**"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "35db4025-9183-4e5f-ba14-0b1b380f49c7",
"execution_count": 14,
"id": "3718710f-f719-4861-a351-0bb9d639d9fd",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.evaluation.run_evaluators import get_qa_evaluator, get_criteria_evaluator\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"eval_llm = ChatOpenAI(temperature=0)\n",
"\n",
"qa_evaluator = get_qa_evaluator(eval_llm)\n",
"helpfulness_evaluator = get_criteria_evaluator(eval_llm, \"helpfulness\")\n",
"conciseness_evaluator = get_criteria_evaluator(eval_llm, \"conciseness\")\n",
"custom_criteria_evaluator = get_criteria_evaluator(\n",
" eval_llm,\n",
" {\n",
" \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n",
" },\n",
")\n",
"\n",
"evaluators = [\n",
" qa_evaluator,\n",
" helpfulness_evaluator,\n",
" conciseness_evaluator,\n",
" custom_criteria_evaluator,\n",
"]"
"deployment_name = f\"Search + Calculator Deployment - {unique_id}\"\n",
"project = client.create_project(deployment_name, mode=\"monitor\")"
]
},
{
"cell_type": "markdown",
"id": "3a993ae7-6d26-495a-8633-64936bf94127",
"metadata": {
"tags": []
},
"source": [
"**Then, deploy your agent to production, making sure to configure the environment to log to the monitoring project.**"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "56dba20a-c07c-4b18-a4e7-834ab6dc87ef",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"agent = initialize_agent(\n",
" tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "4c94a738-dcd3-442e-b8e7-dd36459f56e3",
"id": "569389d4-b613-47ce-99d3-e0031f308185",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5fce1ce42a8c4110b7d12443948ac697",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"0it [00:00, ?it/s]"
"name": "stdout",
"output_type": "stream",
"text": [
"LLMMathChain._evaluate(\"\n",
"US_GDP / average_lifespan\n",
"\") raised error: 'US_GDP'. Please try again with a valid numerical expression\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from tqdm.notebook import tqdm\n",
"os.environ[\"LANGCHAIN_PROJECT\"] = deployment_name\n",
"\n",
"feedbacks = []\n",
"runs = client.list_runs(project_name=chain_results[\"project_name\"], execution_order=1, error=False)\n",
"for run in tqdm(runs):\n",
" if run.outputs is None:\n",
" continue\n",
" eval_feedback = []\n",
" for evaluator in evaluators:\n",
" eval_feedback.append(client.aevaluate_run(run, evaluator))\n",
" feedbacks.extend(await asyncio.gather(*eval_feedback))"
"inputs = [\n",
" \"What's the ratio of the current US GDP to the average lifespan of a human?\",\n",
" \"What's sin of 180 degrees?\",\n",
" \"I need help on my homework\",\n",
" \"If the price of bushel of wheat increases by 10 cents, about how much will that impact the average cost of bread?\",\n",
" # etc.\n",
"]\n",
"for query in inputs:\n",
" try:\n",
" await agent.arun(query)\n",
" except Exception as e:\n",
" print(e)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "8696f167-dc75-4ef8-8bb3-ac1ce8324f30",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<a href=\"https://dev.langchain.plus\", target=\"_blank\" rel=\"noopener\">LangChain+ Client</a>"
],
"text/plain": [
"LangChainPlusClient (API URL: https://dev.api.langchain.plus)"
]
},
"execution_count": 17,
"cell_type": "markdown",
"id": "2646f0fb-81d4-43ce-8a9b-54b8e19841e2",
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client"
"## Conclusion\n",
"\n",
"Congratulations! You have succesfully created connected an agent to LangSmith to trace and debug, evaluated it for accuracy, helpfulness, and trajectory efficiency over a dataset, and instrumented a monitoring project for a simulated \"production\" application!\n",
"\n",
"This was a quick guide to get started, but there are many more ways to use LangSmith to speed up your developer flow and produce better products.\n",
"\n",
"For more information on how you can get the most out of LangSmith, check out [LangSmith documentation](https://docs.langchain.plus/docs/),\n",
"\n",
"and please reach out with questions, feature requests, or feedback at [support@langchain.dev](mailto:support@langchain.dev)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5037e54-2c5a-4993-9b46-2a98773d3079",
"id": "90b7fbff-162d-4c9c-b6fc-33bd5445745f",
"metadata": {},
"outputs": [],
"source": []

View File

@ -169,8 +169,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None:
example: Example,
llm_or_chain: Union[BaseLanguageModel, Chain],
n_repetitions: int,
tracer: Any,
tags: Optional[List[str]] = None,
callbacks: Optional[Any] = None,
) -> List[Dict[str, Any]]:
return [
{"result": f"Result for example {example.id}"} for _ in range(n_repetitions)