diff --git a/langchain/callbacks/tracers/evaluation.py b/langchain/callbacks/tracers/evaluation.py new file mode 100644 index 0000000000..aefd5dd4f6 --- /dev/null +++ b/langchain/callbacks/tracers/evaluation.py @@ -0,0 +1,84 @@ +"""A tracer that runs evaluators over completed runs.""" +from concurrent.futures import Future, ThreadPoolExecutor, wait +from typing import Any, Optional, Sequence, Set, Union +from uuid import UUID + +from langchainplus_sdk import LangChainPlusClient, RunEvaluator + +from langchain.callbacks.tracers.base import BaseTracer +from langchain.callbacks.tracers.schemas import Run + + +class EvaluatorCallbackHandler(BaseTracer): + """A tracer that runs a run evaluator whenever a run is persisted. + + Parameters + ---------- + evaluators : Sequence[RunEvaluator] + The run evaluators to apply to all top level runs. + max_workers : int, optional + The maximum number of worker threads to use for running the evaluators. + If not specified, it will default to the number of evaluators. + client : LangChainPlusClient, optional + The LangChainPlusClient instance to use for evaluating the runs. + If not specified, a new instance will be created. + example_id : Union[UUID, str], optional + The example ID to be associated with the runs. + + Attributes + ---------- + example_id : Union[UUID, None] + The example ID associated with the runs. + client : LangChainPlusClient + The LangChainPlusClient instance used for evaluating the runs. + evaluators : Sequence[RunEvaluator] + The sequence of run evaluators to be executed. + executor : ThreadPoolExecutor + The thread pool executor used for running the evaluators. + futures : Set[Future] + The set of futures representing the running evaluators. + """ + + name = "evaluator_callback_handler" + + def __init__( + self, + evaluators: Sequence[RunEvaluator], + max_workers: Optional[int] = None, + client: Optional[LangChainPlusClient] = None, + example_id: Optional[Union[UUID, str]] = None, + **kwargs: Any + ) -> None: + super().__init__(**kwargs) + self.example_id = ( + UUID(example_id) if isinstance(example_id, str) else example_id + ) + self.client = client or LangChainPlusClient() + self.evaluators = evaluators + self.executor = ThreadPoolExecutor( + max_workers=max(max_workers or len(evaluators), 1) + ) + self.futures: Set[Future] = set() + + def _persist_run(self, run: Run) -> None: + """Run the evaluator on the run. + + Parameters + ---------- + run : Run + The run to be evaluated. + + """ + run_ = run.copy() + run_.reference_example_id = self.example_id + for evaluator in self.evaluators: + self.futures.add( + self.executor.submit(self.client.evaluate_run, run_, evaluator) + ) + + def wait_for_futures(self) -> None: + """Wait for all futures to complete.""" + futures = list(self.futures) + wait(futures) + for future in futures: + self.futures.remove(future) diff --git a/langchain/callbacks/tracers/run_collector.py b/langchain/callbacks/tracers/run_collector.py index 6a04c5ef9d..26e0174fe8 100644 --- a/langchain/callbacks/tracers/run_collector.py +++ b/langchain/callbacks/tracers/run_collector.py @@ -1,20 +1,52 @@ """A tracer that collects all nested runs in a list.""" -from typing import Any, List + +from typing import Any, List, Optional, Union +from uuid import UUID from langchain.callbacks.tracers.base import BaseTracer from langchain.callbacks.tracers.schemas import Run class RunCollectorCallbackHandler(BaseTracer): - """A tracer that collects all nested runs in a list. + """ + A tracer that collects all nested runs in a list. 
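
A minimal usage sketch of the new `EvaluatorCallbackHandler` above, together with the updated `RunCollectorCallbackHandler`: both are tracers, so they can be passed anywhere LangChain accepts callbacks. In this sketch `my_evaluator` (any `RunEvaluator`) and `my_chain` are placeholders, and LangChainPlus credentials are assumed to be configured in the environment.

```python
from uuid import uuid4

from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler
from langchain.callbacks.tracers.run_collector import RunCollectorCallbackHandler

example_id = uuid4()  # the dataset example the runs should be scored against

# Submits every persisted top-level run to a thread pool and scores it with
# each evaluator. `my_evaluator` is a placeholder for any RunEvaluator.
eval_handler = EvaluatorCallbackHandler(evaluators=[my_evaluator], example_id=example_id)

# Keeps the same runs in memory for later inspection.
collector = RunCollectorCallbackHandler(example_id=example_id)

# `my_chain` is a placeholder for any chain, agent, or LLM call.
my_chain.run("What is 2 raised to the 0.43 power?", callbacks=[eval_handler, collector])

# Evaluations run in background threads; block until they finish.
eval_handler.wait_for_futures()
print(f"Collected {len(collector.traced_runs)} runs")
```
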
+ + This tracer is useful for inspection and evaluation purposes. - Useful for inspection and for evaluation.""" + Parameters + ---------- + example_id : Optional[Union[UUID, str]], default=None + The ID of the example being traced. It can be either a UUID or a string. + """ name = "run-collector_callback_handler" - def __init__(self, **kwargs: Any) -> None: + def __init__( + self, example_id: Optional[Union[UUID, str]] = None, **kwargs: Any + ) -> None: + """ + Initialize the RunCollectorCallbackHandler. + + Parameters + ---------- + example_id : Optional[Union[UUID, str]], default=None + The ID of the example being traced. It can be either a UUID or a string. + """ super().__init__(**kwargs) + self.example_id = ( + UUID(example_id) if isinstance(example_id, str) else example_id + ) self.traced_runs: List[Run] = [] def _persist_run(self, run: Run) -> None: - self.traced_runs.append(run) + """ + Persist a run by adding it to the traced_runs list. + + Parameters + ---------- + run : Run + The run to be persisted. + """ + run_ = run.copy() + run_.reference_example_id = self.example_id + self.traced_runs.append(run_) diff --git a/langchain/client/runner_utils.py b/langchain/client/runner_utils.py index a2b58aa6a3..0bb2641523 100644 --- a/langchain/client/runner_utils.py +++ b/langchain/client/runner_utils.py @@ -1,4 +1,5 @@ -"""Utilities for running LLMs/Chains over datasets.""" +"""Utilities for running language models or Chains over datasets.""" + from __future__ import annotations import asyncio @@ -13,15 +14,18 @@ from typing import ( Iterator, List, Optional, + Sequence, Union, ) -from langchainplus_sdk import LangChainPlusClient +from langchainplus_sdk import LangChainPlusClient, RunEvaluator from langchainplus_sdk.schemas import Example from langchain.base_language import BaseLanguageModel from langchain.callbacks.base import BaseCallbackHandler from langchain.callbacks.manager import Callbacks +from langchain.callbacks.tracers.base import BaseTracer +from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler from langchain.callbacks.tracers.langchain import LangChainTracer from langchain.chains.base import Chain from langchain.chat_models.base import BaseChatModel @@ -41,11 +45,21 @@ MODEL_OR_CHAIN_FACTORY = Union[Callable[[], Chain], BaseLanguageModel] class InputFormatError(Exception): - """Raised when input format is invalid.""" + """Raised when the input format is invalid.""" def _get_prompts(inputs: Dict[str, Any]) -> List[str]: - """Get prompts from inputs.""" + """ + Get prompts from inputs. + + Args: + inputs: The input dictionary. + + Returns: + A list of prompts. + Raises: + InputFormatError: If the input format is invalid. + """ if not inputs: raise InputFormatError("Inputs should not be empty.") @@ -83,7 +97,17 @@ def _get_prompts(inputs: Dict[str, Any]) -> List[str]: def _get_messages(inputs: Dict[str, Any]) -> List[List[BaseMessage]]: - """Get Chat Messages from inputs.""" + """ + Get Chat Messages from inputs. + + Args: + inputs: The input dictionary. + + Returns: + A list of chat messages. + Raises: + InputFormatError: If the input format is invalid. 
+ """ if not inputs: raise InputFormatError("Inputs should not be empty.") @@ -112,13 +136,25 @@ def _get_messages(inputs: Dict[str, Any]) -> List[List[BaseMessage]]: async def _arun_llm( llm: BaseLanguageModel, inputs: Dict[str, Any], - langchain_tracer: Optional[LangChainTracer], *, tags: Optional[List[str]] = None, + callbacks: Callbacks = None, ) -> Union[LLMResult, ChatResult]: - callbacks: Optional[List[BaseCallbackHandler]] = ( - [langchain_tracer] if langchain_tracer else None - ) + """ + Asynchronously run the language model. + + Args: + llm: The language model to run. + inputs: The input dictionary. + tags: Optional tags to add to the run. + callbacks: Optional callbacks to use during the run. + + Returns: + The LLMResult or ChatResult. + Raises: + ValueError: If the LLM type is unsupported. + InputFormatError: If the input format is invalid. + """ if isinstance(llm, BaseLLM): try: llm_prompts = _get_prompts(inputs) @@ -152,18 +188,32 @@ async def _arun_llm_or_chain( example: Example, llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, n_repetitions: int, - langchain_tracer: Optional[LangChainTracer], *, tags: Optional[List[str]] = None, + callbacks: Optional[List[BaseCallbackHandler]] = None, ) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]: - """Run the chain asynchronously.""" - if langchain_tracer is not None: - previous_example_id = langchain_tracer.example_id - langchain_tracer.example_id = example.id - callbacks: Optional[List[BaseCallbackHandler]] = [langchain_tracer] + """ + Asynchronously run the Chain or language model. + + Args: + example: The example to run. + llm_or_chain_factory: The Chain or language model constructor to run. + n_repetitions: The number of times to run the model on each example. + tags: Optional tags to add to the run. + callbacks: Optional callbacks to use during the run. + + Returns: + A list of outputs. + """ + if callbacks: + previous_example_ids = [ + getattr(tracer, "example_id", None) for tracer in callbacks + ] + for tracer in callbacks: + if hasattr(tracer, "example_id"): + tracer.example_id = example.id else: - previous_example_id = None - callbacks = None + previous_example_ids = None outputs = [] for _ in range(n_repetitions): try: @@ -171,8 +221,8 @@ async def _arun_llm_or_chain( output: Any = await _arun_llm( llm_or_chain_factory, example.inputs, - langchain_tracer, tags=tags, + callbacks=callbacks, ) else: chain = llm_or_chain_factory() @@ -183,15 +233,19 @@ async def _arun_llm_or_chain( except Exception as e: logger.warning(f"Chain failed for example {example.id}. Error: {e}") outputs.append({"Error": str(e)}) - if langchain_tracer is not None: - langchain_tracer.example_id = previous_example_id + if callbacks and previous_example_ids: + for example_id, tracer in zip(previous_example_ids, callbacks): + if hasattr(tracer, "example_id"): + tracer.example_id = example_id return outputs async def _gather_with_concurrency( n: int, - initializer: Callable[[], Coroutine[Any, Any, Optional[LangChainTracer]]], - *async_funcs: Callable[[Optional[LangChainTracer], Dict], Coroutine[Any, Any, Any]], + initializer: Callable[[], Coroutine[Any, Any, Any]], + *async_funcs: Callable[ + [Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any] + ], ) -> List[Any]: """ Run coroutines with a concurrency limit. 
@@ -207,37 +261,42 @@ async def _gather_with_concurrency( semaphore = asyncio.Semaphore(n) job_state = {"num_processed": 0} - tracer_queue: asyncio.Queue[Optional[LangChainTracer]] = asyncio.Queue() + callback_queue: asyncio.Queue[Sequence[BaseCallbackHandler]] = asyncio.Queue() for _ in range(n): - tracer_queue.put_nowait(await initializer()) + callback_queue.put_nowait(await initializer()) async def run_coroutine_with_semaphore( async_func: Callable[ - [Optional[LangChainTracer], Dict], Coroutine[Any, Any, Any] + [Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any] ] ) -> Any: async with semaphore: - tracer = await tracer_queue.get() + callbacks = await callback_queue.get() try: - result = await async_func(tracer, job_state) + result = await async_func(callbacks, job_state) finally: - tracer_queue.put_nowait(tracer) + callback_queue.put_nowait(callbacks) return result results = await asyncio.gather( *(run_coroutine_with_semaphore(function) for function in async_funcs) ) - while tracer_queue: + while callback_queue: try: - tracer = tracer_queue.get_nowait() + callbacks = callback_queue.get_nowait() except asyncio.QueueEmpty: break - if tracer: - tracer.wait_for_futures() + for callback in callbacks: + if isinstance(callback, (LangChainTracer, EvaluatorCallbackHandler)): + callback.wait_for_futures() return results -async def _tracer_initializer(project_name: Optional[str]) -> Optional[LangChainTracer]: +async def _callbacks_initializer( + project_name: Optional[str], + client: LangChainPlusClient, + run_evaluators: Sequence[RunEvaluator], +) -> List[BaseTracer]: """ Initialize a tracer to share across tasks. @@ -247,11 +306,19 @@ async def _tracer_initializer(project_name: Optional[str]) -> Optional[LangChain Returns: A LangChainTracer instance with an active project. """ + callbacks: List[BaseTracer] = [] if project_name: - tracer = LangChainTracer(project_name=project_name) - return tracer - else: - return None + callbacks.append(LangChainTracer(project_name=project_name)) + if run_evaluators: + callbacks.append( + EvaluatorCallbackHandler( + client=client, + evaluators=run_evaluators, + # We already have concurrency, don't want to overload the machine + max_workers=1, + ) + ) + return callbacks async def arun_on_examples( @@ -262,13 +329,16 @@ async def arun_on_examples( num_repetitions: int = 1, project_name: Optional[str] = None, verbose: bool = False, + client: Optional[LangChainPlusClient] = None, tags: Optional[List[str]] = None, + run_evaluators: Optional[Sequence[RunEvaluator]] = None, ) -> Dict[str, Any]: """ - Run the chain on examples and store traces to the specified project name. + Asynchronously run the chain on examples and store traces + to the specified project name. Args: - examples: Examples to run the model or chain over + examples: Examples to run the model or chain over. llm_or_chain_factory: Language model or Chain constructor to run over the dataset. The Chain constructor is used to permit independent calls on each example without carrying over state. @@ -277,24 +347,35 @@ async def arun_on_examples( This is useful when testing success rates or generating confidence intervals. project_name: Project name to use when tracing runs. + Defaults to {dataset_name}-{chain class name}-{datetime}. verbose: Whether to print progress. - tags: Tags to add to the traces. + client: Client to use to read the dataset. If not provided, a new + client will be created using the credentials in the environment. + tags: Tags to add to each run in the project. 
+ run_evaluators: Evaluators to run on the results of the chain. Returns: A dictionary mapping example ids to the model outputs. """ + project_name = _get_project_name(project_name, llm_or_chain_factory, None) + client_ = client or LangChainPlusClient() + client_.create_project(project_name, mode="eval") + results: Dict[str, List[Any]] = {} + evaluation_handler = EvaluatorCallbackHandler( + evaluators=run_evaluators or [], client=client_ + ) async def process_example( - example: Example, tracer: Optional[LangChainTracer], job_state: dict + example: Example, callbacks: List[BaseCallbackHandler], job_state: dict ) -> None: """Process a single example.""" result = await _arun_llm_or_chain( example, llm_or_chain_factory, num_repetitions, - tracer, tags=tags, + callbacks=callbacks, ) results[str(example.id)] = result job_state["num_processed"] += 1 @@ -307,9 +388,15 @@ async def arun_on_examples( await _gather_with_concurrency( concurrency_level, - functools.partial(_tracer_initializer, project_name), + functools.partial( + _callbacks_initializer, + project_name=project_name, + client=client_, + run_evaluators=run_evaluators or [], + ), *(functools.partial(process_example, e) for e in examples), ) + evaluation_handler.wait_for_futures() return results @@ -320,7 +407,21 @@ def run_llm( *, tags: Optional[List[str]] = None, ) -> Union[LLMResult, ChatResult]: - """Run the language model on the example.""" + """ + Run the language model on the example. + + Args: + llm: The language model to run. + inputs: The input dictionary. + callbacks: The callbacks to use during the run. + tags: Optional tags to add to the run. + + Returns: + The LLMResult or ChatResult. + Raises: + ValueError: If the LLM type is unsupported. + InputFormatError: If the input format is invalid. + """ if isinstance(llm, BaseLLM): try: llm_prompts = _get_prompts(inputs) @@ -350,18 +451,32 @@ def run_llm_or_chain( example: Example, llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, n_repetitions: int, - langchain_tracer: Optional[LangChainTracer] = None, *, tags: Optional[List[str]] = None, + callbacks: Optional[List[BaseCallbackHandler]] = None, ) -> Union[List[dict], List[str], List[LLMResult], List[ChatResult]]: - """Run the chain synchronously.""" - if langchain_tracer is not None: - previous_example_id = langchain_tracer.example_id - langchain_tracer.example_id = example.id - callbacks: Optional[List[BaseCallbackHandler]] = [langchain_tracer] + """ + Run the Chain or language model synchronously. + + Args: + example: The example to run. + llm_or_chain_factory: The Chain or language model constructor to run. + n_repetitions: The number of times to run the model on each example. + tags: Optional tags to add to the run. + callbacks: Optional callbacks to use during the run. + + Returns: + A list of outputs. + """ + if callbacks: + previous_example_ids = [ + getattr(tracer, "example_id", None) for tracer in callbacks + ] + for tracer in callbacks: + if hasattr(tracer, "example_id"): + tracer.example_id = example.id else: - previous_example_id = None - callbacks = None + previous_example_ids = None outputs = [] for _ in range(n_repetitions): try: @@ -376,8 +491,10 @@ def run_llm_or_chain( except Exception as e: logger.warning(f"Chain failed for example {example.id}. 
Error: {e}") outputs.append({"Error": str(e)}) - if langchain_tracer is not None: - langchain_tracer.example_id = previous_example_id + if callbacks and previous_example_ids: + for example_id, tracer in zip(previous_example_ids, callbacks): + if hasattr(tracer, "example_id"): + tracer.example_id = example_id return outputs @@ -388,48 +505,74 @@ def run_on_examples( num_repetitions: int = 1, project_name: Optional[str] = None, verbose: bool = False, + client: Optional[LangChainPlusClient] = None, tags: Optional[List[str]] = None, + run_evaluators: Optional[Sequence[RunEvaluator]] = None, ) -> Dict[str, Any]: - """Run the chain on examples and store traces to the specified project name. + """ + Run the Chain or language model on examples and store + traces to the specified project name. Args: - examples: Examples to run model or chain over. + examples: Examples to run the model or chain over. llm_or_chain_factory: Language model or Chain constructor to run over the dataset. The Chain constructor is used to permit independent calls on each example without carrying over state. - concurrency_level: Number of async workers to run in parallel. num_repetitions: Number of times to run the model on each example. This is useful when testing success rates or generating confidence intervals. - project_name: Project name to use when tracing runs. + project_name: Name of the project to store the traces in. + Defaults to {dataset_name}-{chain class name}-{datetime}. verbose: Whether to print progress. - tags: Tags to add to the run traces. + client: Client to use to access the dataset. If None, a new client + will be created using the credentials in the environment. + tags: Tags to add to each run in the project. + run_evaluators: Evaluators to run on the results of the chain. + Returns: A dictionary mapping example ids to the model outputs. """ results: Dict[str, Any] = {} - tracer = LangChainTracer(project_name=project_name) if project_name else None + project_name = _get_project_name(project_name, llm_or_chain_factory, None) + client_ = client or LangChainPlusClient() + client_.create_project(project_name, mode="eval") + tracer = LangChainTracer(project_name=project_name) + evalution_handler = EvaluatorCallbackHandler( + evaluators=run_evaluators or [], client=client_ + ) + callbacks: List[BaseCallbackHandler] = [tracer, evalution_handler] for i, example in enumerate(examples): result = run_llm_or_chain( example, llm_or_chain_factory, num_repetitions, - langchain_tracer=tracer, tags=tags, + callbacks=callbacks, ) if verbose: print(f"{i+1} processed", flush=True, end="\r") results[str(example.id)] = result - if tracer: - tracer.wait_for_futures() + tracer.wait_for_futures() + evalution_handler.wait_for_futures() return results def _get_project_name( project_name: Optional[str], llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, - dataset_name: str, + dataset_name: Optional[str], ) -> str: + """ + Get the project name. + + Args: + project_name: The project name if manually specified. + llm_or_chain_factory: The Chain or language model constructor. + dataset_name: The dataset name. + + Returns: + The project name. 
+ """ if project_name is not None: return project_name current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") @@ -437,7 +580,8 @@ def _get_project_name( model_name = llm_or_chain_factory.__class__.__name__ else: model_name = llm_or_chain_factory().__class__.__name__ - return f"{dataset_name}-{model_name}-{current_time}" + dataset_prefix = f"{dataset_name}-" if dataset_name else "" + return f"{dataset_prefix}{model_name}-{current_time}" async def arun_on_dataset( @@ -450,12 +594,13 @@ async def arun_on_dataset( verbose: bool = False, client: Optional[LangChainPlusClient] = None, tags: Optional[List[str]] = None, + run_evaluators: Optional[Sequence[RunEvaluator]] = None, ) -> Dict[str, Any]: """ - Run the chain on a dataset and store traces to the specified project name. + Asynchronously run the Chain or language model on a dataset + and store traces to the specified project name. Args: - client: Client to use to read the dataset. dataset_name: Name of the dataset to run the chain on. llm_or_chain_factory: Language model or Chain constructor to run over the dataset. The Chain constructor is used to permit @@ -469,7 +614,8 @@ async def arun_on_dataset( verbose: Whether to print progress. client: Client to use to read the dataset. If not provided, a new client will be created using the credentials in the environment. - tags: Tags to add to each run in the sesssion. + tags: Tags to add to each run in the project. + run_evaluators: Evaluators to run on the results of the chain. Returns: A dictionary containing the run's project name and the resulting model outputs. @@ -478,7 +624,6 @@ async def arun_on_dataset( project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name) dataset = client_.read_dataset(dataset_name=dataset_name) examples = client_.list_examples(dataset_id=str(dataset.id)) - results = await arun_on_examples( examples, llm_or_chain_factory, @@ -486,7 +631,9 @@ async def arun_on_dataset( num_repetitions=num_repetitions, project_name=project_name, verbose=verbose, + client=client_, tags=tags, + run_evaluators=run_evaluators, ) return { "project_name": project_name, @@ -503,8 +650,11 @@ def run_on_dataset( verbose: bool = False, client: Optional[LangChainPlusClient] = None, tags: Optional[List[str]] = None, + run_evaluators: Optional[Sequence[RunEvaluator]] = None, ) -> Dict[str, Any]: - """Run the chain on a dataset and store traces to the specified project name. + """ + Run the Chain or language model on a dataset and store traces + to the specified project name. Args: dataset_name: Name of the dataset to run the chain on. @@ -520,7 +670,8 @@ def run_on_dataset( verbose: Whether to print progress. client: Client to use to access the dataset. If None, a new client will be created using the credentials in the environment. - tags: Tags to add to each run in the sesssion. + tags: Tags to add to each run in the project. + run_evaluators: Evaluators to run on the results of the chain. Returns: A dictionary containing the run's project name and the resulting model outputs. 
@@ -536,6 +687,8 @@ def run_on_dataset( project_name=project_name, verbose=verbose, tags=tags, + run_evaluators=run_evaluators, + client=client_, ) return { "project_name": project_name, diff --git a/langchain/evaluation/run_evaluators/implementations.py b/langchain/evaluation/run_evaluators/implementations.py index 675f01988c..801745f981 100644 --- a/langchain/evaluation/run_evaluators/implementations.py +++ b/langchain/evaluation/run_evaluators/implementations.py @@ -117,10 +117,12 @@ def get_qa_evaluator( choices_map={"CORRECT": 1, "INCORRECT": 0}, ), ) + tags = kwargs.pop("tags", []) return RunEvaluatorChain( eval_chain=eval_chain, input_mapper=input_mapper, output_parser=output_parser, + tags=tags + [evaluation_name], **kwargs, ) @@ -174,6 +176,7 @@ def get_criteria_evaluator( choices_map={"Y": 1, "N": 0}, evaluation_name=evaluation_name ), ) + tags = kwargs.pop("tags", []) eval_chain = CriteriaEvalChain.from_llm( llm=llm, criteria=criteria_, prompt=prompt, **kwargs ) @@ -181,6 +184,7 @@ def get_criteria_evaluator( eval_chain=eval_chain, input_mapper=input_mapper, output_parser=parser, + tags=tags + [evaluation_name], **kwargs, ) @@ -303,9 +307,11 @@ def get_trajectory_evaluator( TrajectoryEvalOutputParser(evaluation_name=evaluation_name), ) eval_chain = LLMChain(llm=llm, prompt=prompt, **kwargs) + tags = kwargs.pop("tags", []) return RunEvaluatorChain( eval_chain=eval_chain, input_mapper=input_mapper, output_parser=parser, + tags=tags + [evaluation_name], **kwargs, ) diff --git a/langchain/experimental/client/tracing_datasets.ipynb b/langchain/experimental/client/tracing_datasets.ipynb index f3929c34f6..d7fb757942 100644 --- a/langchain/experimental/client/tracing_datasets.ipynb +++ b/langchain/experimental/client/tracing_datasets.ipynb @@ -1,677 +1,705 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "1a4596ea-a631-416d-a2a4-3577c140493d", - "metadata": { - "tags": [] - }, - "source": [ - "# Tracing and Datasets with LangChainPlus\n", - "\n", - "LangChain makes it easy to get started with Agents and other LLM applications. However, it can be tricky to get right, especially when you need to deliver a full product. To speed up your application development process, and to help monitor your applications in production, LangChain offers additional tracing and tooling.\n", - "\n", - "When might you want to use tracing? Some situations we've found it useful include:\n", - "- Quickly debugging a new chain, agent, or set of tools\n", - "- Evaluating a given chain across different LLMs or Chat Models to compare results or improve prompts\n", - "- Running a given chain multiple time on a dataset to ensure it consistently meets a quality bar.\n", - "\n", - "\n", - "In this notebook, we'll show how to enable tracing in your LangChain applications and walk you a couple common ways to evaluate your agents.\n", - "We'll focus on using Datasets to benchmark Chain behavior.\n", - "\n", - "**Bear in mind that this notebook is designed under the assumption that you're running the latest LangChain+ server locally in the background. This is done using the folowing command in your terminal:**\n", - "\n", - "\n", - "```\n", - "pip install --upgrade langchain\n", - "langchain plus start\n", - "```\n", - "\n", - "We also have a hosted version which is in private beta. We will share more details as it progresses.\n", - "\n", - "Now, let's get started by creating a client to connect to LangChain+." 
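
The `implementations.py` changes earlier in this diff also let callers attach their own tags to the evaluation chains, with the evaluator's name appended automatically. A short sketch of how that might be used, assuming the helpers forward the `tags` keyword as the diff intends:

```python
from langchain.chat_models import ChatOpenAI
from langchain.evaluation.run_evaluators import get_criteria_evaluator, get_qa_evaluator

eval_llm = ChatOpenAI(temperature=0)

# Custom tags are merged with the evaluator's own name on the resulting
# RunEvaluatorChain, so the feedback runs can be filtered later.
qa_evaluator = get_qa_evaluator(eval_llm, tags=["nightly-eval"])
helpfulness_evaluator = get_criteria_evaluator(eval_llm, "helpfulness", tags=["nightly-eval"])
```
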
- ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "2d77d064-41b4-41fb-82e6-2d16461269ec", - "metadata": { - "tags": [] - }, - "source": [ - "## Setting up Tracing\n", - "\n", - "The V2 tracing API can be activated by setting the `LANGCHAIN_TRACING_V2` environment variable to true. Assuming you've successfully initiated the server as described earlier, running LangChain Agents, Chains, LLMs, and other primitives will automatically start capturing traces. Let's begin our exploration with a straightforward math example.\n", - "\n", - "**NOTE**: You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "7935e832-9ae1-4557-8d08-890c425f18e2", - "metadata": {}, - "source": [ - "**NOTE:** You can also use the `tracing_v2_enabled` context manager to capture projects within a given context:\n", - "```\n", - "from langchain.callbacks.manager import tracing_v2_enabled\n", - "with tracing_v2_enabled(\"My Project Name\"):\n", - " ...\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "221b638a-2ae4-46ef-bf6a-d59bf85d587f", - "metadata": {}, - "source": [ - "**NOTE:** You can optionally set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables if using the hosted version which is in private beta." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "904db9a5-f387-4a57-914c-c8af8d39e249", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "You can click the link below to view the UI\n" - ] - }, - { - "data": { - "text/html": [ - "LangChain+ Client" - ], - "text/plain": [ - "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import os\n", - "from langchainplus_sdk import LangChainPlusClient\n", - "\n", - "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", - "os.environ[\"LANGCHAIN_PROJECT\"] = \"Tracing Walkthrough\"\n", - "# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\" # Uncomment this line if you want to use the hosted version\n", - "# os.environ[\"LANGCHAIN_API_KEY\"] = \"\" # Uncomment this line if you want to use the hosted version.\n", - "\n", - "client = LangChainPlusClient()\n", - "print(\"You can click the link below to view the UI\")\n", - "client" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "7c801853-8e96-404d-984c-51ace59cbbef", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.agents import initialize_agent, load_tools\n", - "from langchain.agents import AgentType\n", - "\n", - "llm = ChatOpenAI(temperature=0)\n", - "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", - "agent = initialize_agent(\n", - " tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "19537902-b95c-4390-80a4-f6c9a937081e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import asyncio\n", - "\n", - "inputs = [\n", - " \"How many people live in canada as of 2023?\",\n", - " \"who is dua lipa's boyfriend? 
what is his age raised to the .43 power?\",\n", - " \"what is dua lipa's boyfriend age raised to the .43 power?\",\n", - " \"how far is it from paris to boston in miles\",\n", - " \"what was the total number of points scored in the 2023 super bowl? what is that number raised to the .23 power?\",\n", - " \"what was the total number of points scored in the 2023 super bowl raised to the .23 power?\",\n", - " \"how many more points were scored in the 2023 super bowl than in the 2022 super bowl?\",\n", - " \"what is 153 raised to .1312 power?\",\n", - " \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\",\n", - " \"what is 1213 divided by 4345?\",\n", - "]\n", - "results = []\n", - "\n", - "\n", - "async def arun(agent, input_example):\n", - " try:\n", - " return await agent.arun(input_example)\n", - " except Exception as e:\n", - " # The agent sometimes makes mistakes! These will be captured by the tracing.\n", - " return e\n", - "\n", - "\n", - "for input_example in inputs:\n", - " results.append(arun(agent, input_example))\n", - "results = await asyncio.gather(*results)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "6c43c311-4e09-4d57-9ef3-13afb96ff430", - "metadata": {}, - "source": [ - "## Creating the Dataset\n", - "\n", - "Now that you've captured a project entitled 'Tracing Walkthrough', it's time to create a dataset. We will do so using the `create_dataset` method below." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "d14a9881-2a01-404c-8c56-0b78565c3ff4", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "dataset_name = \"calculator-example-dataset\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "if dataset_name in set([dataset.name for dataset in client.list_datasets()]):\n", - " client.delete_dataset(dataset_name=dataset_name)\n", - "dataset = client.create_dataset(\n", - " dataset_name, description=\"A calculator example dataset\"\n", - ")\n", - "runs = client.list_runs(\n", - " project_name=os.environ[\"LANGCHAIN_PROJECT\"],\n", - " execution_order=1, # Only return the top-level runs\n", - " error=False, # Only runs that succeed\n", - ")\n", - "for run in runs:\n", - " if run.outputs is None:\n", - " continue\n", - " try:\n", - " client.create_example(\n", - " inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id\n", - " )\n", - " except:\n", - " pass" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "db79dea2-fbaa-4c12-9083-f6154b51e2d3", - "metadata": { - "jp-MarkdownHeadingCollapsed": true, - "tags": [] - }, - "source": [ - "**Alternative: Creating a Dataset in the UI** \n", - "\n", - "Alternatively, you could create or edit the dataset in the UI using the following steps:\n", - "\n", - " 1. Navigate to the UI by clicking on the link below.\n", - " 2. Select the 'search_and_math_chain' project from the list.\n", - " 3. Next to the fist example, click \"+ to Dataset\".\n", - " 4. Click \"Create Dataset\" and create a title **\"calculator-example-dataset\"**.\n", - " 5. Add the other examples to the dataset as well\n", - "\n", - "Once you've used LangChain+ for a while, you will have a number of datasets to work with. 
To view all saved datasets, execute the following code:\n", - "\n", - "```\n", - "datasets = client.list_datasets()\n", - "print(datasets)\n", - "```\n", - "\n", - "\n", - "**Optional:** If you didn't run the trace above, you can also create datasets by uploading dataframes or CSV files." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1baa677c-5642-4378-8e01-3aa1647f19d6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# !pip install datasets > /dev/null\n", - "# !pip install pandas > /dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "60d14593-c61f-449f-a38f-772ca43707c2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# import pandas as pd\n", - "# from langchain.evaluation.loading import load_dataset\n", - "\n", - "# dataset = load_dataset(\"agent-search-calculator\")\n", - "# df = pd.DataFrame(dataset, columns=[\"question\", \"answer\"])\n", - "# df.columns = [\"input\", \"output\"] # The chain we want to evaluate below expects inputs with the \"input\" key\n", - "# df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "52a7ea76-79ca-4765-abf7-231e884040d6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# dataset_name = \"calculator-example-dataset\"\n", - "\n", - "# if dataset_name not in set([dataset.name for dataset in client.list_datasets()]):\n", - "# dataset = client.upload_dataframe(df,\n", - "# name=dataset_name,\n", - "# description=\"A calculator example dataset\",\n", - "# input_keys=[\"input\"],\n", - "# output_keys=[\"output\"],\n", - "# )" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "07885b10", - "metadata": { - "tags": [] - }, - "source": [ - "## Running a Chain on a Traced Dataset\n", - "\n", - "Once you have a dataset, you can run a compatible chain or other object over it to see its results. The run traces will automatically be associated with the dataset for easy attribution and analysis.\n", - "\n", - "**First, we'll define the chain we wish to run over the dataset.**\n", - "\n", - "In this case, we're using an agent, but it can be any simple chain." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c2b59104-b90e-466a-b7ea-c5bd0194263b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.agents import initialize_agent, load_tools\n", - "from langchain.agents import AgentType\n", - "\n", - "llm = ChatOpenAI(temperature=0)\n", - "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", - "agent = initialize_agent(\n", - " tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "84094a4a-1d76-461c-bc37-8c537939b466", - "metadata": {}, - "source": [ - "**Now we're ready to run the chain!**\n", - "\n", - "The docstring below hints ways you can configure the method to run." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "112d7bdf-7e50-4c1a-9285-5bac8473f2ee", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m\n", - "\u001b[0marun_on_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdataset_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mllm_or_chain_factory\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'MODEL_OR_CHAIN_FACTORY'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mconcurrency_level\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mnum_repetitions\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mproject_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[str]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[LangChainPlusClient]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[List[str]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'Dict[str, Any]'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mDocstring:\u001b[0m\n", - "Run the chain on a dataset and store traces to the specified project name.\n", - "\n", - "Args:\n", - " client: Client to use to read the dataset.\n", - " dataset_name: Name of the dataset to run the chain on.\n", - " llm_or_chain_factory: Language model or Chain constructor to run\n", - " over the dataset. The Chain constructor is used to permit\n", - " independent calls on each example without carrying over state.\n", - " concurrency_level: The number of async tasks to run concurrently.\n", - " num_repetitions: Number of times to run the model on each example.\n", - " This is useful when testing success rates or generating confidence\n", - " intervals.\n", - " project_name: Name of the project to store the traces in.\n", - " Defaults to {dataset_name}-{chain class name}-{datetime}.\n", - " verbose: Whether to print progress.\n", - " client: Client to use to read the dataset. 
If not provided, a new\n", - " client will be created using the credentials in the environment.\n", - " tags: Tags to add to each run in the sesssion.\n", - "\n", - "Returns:\n", - " A dictionary containing the run's project name and the resulting model outputs.\n", - "\u001b[0;31mFile:\u001b[0m ~/code/lc/lckg/langchain/client/runner_utils.py\n", - "\u001b[0;31mType:\u001b[0m function" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain.client import arun_on_dataset\n", - "\n", - "?arun_on_dataset" - ] - }, + "cells": [ + { + "cell_type": "markdown", + "id": "1a4596ea-a631-416d-a2a4-3577c140493d", + "metadata": { + "tags": [] + }, + "source": [ + "# Debug, Evaluate, and Monitor LLMs with LangSmith\n", + "\n", + "LangChain makes it easy to get started with Agents and other LLM applications. Even so, delivering a high-quality agent to production can be deceptively difficult. To aid the development process, we've designed tracing and callbacks at the core of LangChain. In this notebook, you will get started prototyping, testing, and monitoring an LLM agent.\n", + "\n", + "When might you want to use tracing? Some situations we've found it useful include:\n", + "- Quickly debugging a new chain, agent, or set of tools\n", + "- Evaluating a given chain across different LLMs or Chat Models to compare results or improve prompts\n", + "- Running a given chain multiple time on a dataset to ensure it consistently meets a quality bar.\n", + "- Capturing production traces and using LangChain summarizers to analyze app usage" + ] + }, + { + "cell_type": "markdown", + "id": "138fbb8f-960d-4d26-9dd5-6d6acab3ee55", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "**Either [create a hosted LangSmith account](https://www.langchain.plus/) and connect with an API key OR\n", + "run the server locally.**\n", + "\n", + "\n", + "To run the local server, execute the following comand in your terminal:\n", + "```\n", + "pip install --upgrade langchain\n", + "langchain plus start\n", + "```\n", + "\n", + "Now, let's get started by creating a client to connect to LangChain+." + ] + }, + { + "cell_type": "markdown", + "id": "2d77d064-41b4-41fb-82e6-2d16461269ec", + "metadata": { + "tags": [] + }, + "source": [ + "## Debug your Agent\n", + "\n", + "First, configure your environment variables to tell LangChain to log traces. This is done by setting the `LANGCHAIN_TRACING_V2` environment variable to true.\n", + "You can tell LangChain which project to log to by setting the `LANGCHAIN_PROJECT` environment variable. This will automatically create a debug project for you.\n", + "\n", + "For more information on other ways to set up tracing, please reference the [LangSmith documentation](https://docs.langchain.plus/docs/)\n", + "\n", + "**NOTE:** You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n", + "\n", + "**NOTE:** You can optionally set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables if using the hosted version which is in private beta." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "904db9a5-f387-4a57-914c-c8af8d39e249", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 11, - "id": "6e10f823", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Since chains can be stateful (e.g. 
they can have memory), we need provide\n", - "# a way to initialize a new chain for each row in the dataset. This is done\n", - "# by passing in a factory function that returns a new chain for each row.\n", - "chain_factory = lambda: initialize_agent(\n", - " tools,\n", - " llm,\n", - " agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n", - " verbose=False,\n", - ")\n", - "\n", - "# If your chain is NOT stateful, your lambda can return the object directly\n", - "# to improve runtime performance. For example:\n", - "# chain_factory = lambda: agent" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "You can click the link below to view the UI\n" + ] }, { - "cell_type": "code", - "execution_count": 13, - "id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processed examples: 1\r" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Chain failed for example b36a82d3-4fb6-4bc4-87df-b7c355742b8e. Error: unknown format from LLM: Sorry, I cannot answer this question as it requires information that is not currently available.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processed examples: 6\r" - ] - } + "data": { + "text/html": [ + "LangChain+ Client" ], - "source": [ - "chain_results = await arun_on_dataset(\n", - " dataset_name=dataset_name,\n", - " llm_or_chain_factory=chain_factory,\n", - " concurrency_level=5, # Optional, sets the number of examples to run at a time\n", - " verbose=True,\n", - " client=client,\n", - " tags=[\n", - " \"testing-notebook\",\n", - " \"turbo\",\n", - " ], # Optional, adds a tag to the resulting chain runs\n", - ")\n", - "\n", - "# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n", - "# These are logged as warnings here and captured as errors in the tracing UI." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4", - "metadata": { - "tags": [] - }, - "source": [ - "### Reviewing the Chain Results\n", - "\n", - "You can review the results of the run in the tracing UI below and navigating to the project \n", - "with the title **\"Search + Calculator Agent Evaluation\"**" + "text/plain": [ + "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" ] - }, + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "from uuid import uuid4\n", + "from langchainplus_sdk import LangChainPlusClient\n", + "\n", + "unique_id = uuid4().hex[0:8]\n", + "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", + "os.environ[\"LANGCHAIN_PROJECT\"] = f\"Tracing Walkthrough - {unique_id}\"\n", + "# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\" # Uncomment this line to use the hosted version\n", + "# os.environ[\"LANGCHAIN_API_KEY\"] = \"\" # Uncomment this line to use the hosted version.\n", + "\n", + "# Used by the agent below\n", + "# os.environ[\"OPENAI_API_KEY\"] = \"\"\n", + "# os.environ[\"SERPAPI_API_KEY\"] = \"\"\n", + "\n", + "client = LangChainPlusClient()\n", + "print(\"You can click the link below to view the UI\")\n", + "client" + ] + }, + { + "cell_type": "markdown", + "id": "ca27fa11-ddce-4af0-971e-c5c37d5b92ef", + "metadata": {}, + "source": [ + "Now, start prototyping your agent. We will use a straightforward math example." 
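
As an alternative to the environment variables set above, tracing can also be scoped to a block of code with the `tracing_v2_enabled` context manager, as the earlier version of this walkthrough noted. A minimal sketch:

```python
from langchain.callbacks.manager import tracing_v2_enabled
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(temperature=0)

# Runs inside the block are traced to the named project rather than
# the project set via LANGCHAIN_PROJECT.
with tracing_v2_enabled("My Project Name"):
    llm.predict("What is 2 raised to the 0.43 power?")
```
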
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7c801853-8e96-404d-984c-51ace59cbbef", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.agents import initialize_agent, load_tools\n", + "from langchain.agents import AgentType\n", + "\n", + "llm = ChatOpenAI(temperature=0)\n", + "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", + "agent = initialize_agent(\n", + " tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "19537902-b95c-4390-80a4-f6c9a937081e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import asyncio\n", + "\n", + "inputs = [\n", + " \"How many people live in canada as of 2023?\",\n", + " \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n", + " \"what is dua lipa's boyfriend age raised to the .43 power?\",\n", + " \"how far is it from paris to boston in miles\",\n", + " \"what was the total number of points scored in the 2023 super bowl? what is that number raised to the .23 power?\",\n", + " \"what was the total number of points scored in the 2023 super bowl raised to the .23 power?\",\n", + " \"how many more points were scored in the 2023 super bowl than in the 2022 super bowl?\",\n", + " \"what is 153 raised to .1312 power?\",\n", + " \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\",\n", + " \"what is 1213 divided by 4345?\",\n", + "]\n", + "results = []\n", + "\n", + "\n", + "async def arun(agent, input_example):\n", + " try:\n", + " return await agent.arun(input_example)\n", + " except Exception as e:\n", + " # The agent sometimes makes mistakes! These will be captured by the tracing.\n", + " return e\n", + "\n", + "\n", + "for input_example in inputs:\n", + " results.append(arun(agent, input_example))\n", + "results = await asyncio.gather(*results)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0405ff30-21fe-413d-85cf-9fa3c649efec", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.callbacks.tracers.langchain import wait_for_all_tracers\n", + "\n", + "# Logs are submitted in a background thread. Make sure they've been submitted before moving on.\n", + "wait_for_all_tracers()" + ] + }, + { + "cell_type": "markdown", + "id": "9decb964-be07-4b6c-9802-9825c8be7b64", + "metadata": {}, + "source": [ + "Assuming you've successfully initiated the server as described earlier, your agent logs should show up in your server. 
You can check by clicking on the link below:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b7bc3934-bb1a-452c-a723-f9cdb0b416f9", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 14, - "id": "136db492-d6ca-4215-96f9-439c23538241", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "LangChain+ Client" - ], - "text/plain": [ - "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "LangChain+ Client" ], - "source": [ - "# You can navigate to the UI by clicking on the link below\n", - "client" + "text/plain": [ + "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "63ed6561-6574-43b3-a653-fe410aa8a617", - "metadata": {}, - "source": [ - "## Running an Evaluation Chain\n", - "\n", - "Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n", - "It's easier to leverage AI-assisted feedback to evaluate your agent's performance.\n", - "\n", - "A few ways of doing this include:\n", - "- Adding ground-truth answers as outputs to the dataset and evaluating relative to those references.\n", - "- Evaluating the overall agent trajectory based on the tool usage and intermediate steps.\n", - "- Evaluating performance based on 'context' such as retrieved documents or tool results.\n", - "- Evaluating 'aspects' of the agent's response in a reference-free manner using targeted agent prompts.\n", - " \n", - "Below, we show how to run an evaluation chain that compares the model output with the ground-truth answers.\n", - "\n", - "**Note: the feedback API is currently experimental and subject to change.**" - ] - }, + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client" + ] + }, + { + "cell_type": "markdown", + "id": "6c43c311-4e09-4d57-9ef3-13afb96ff430", + "metadata": {}, + "source": [ + "## Test\n", + "\n", + "Once you've debugged a prototype of your agent, you will want to create tests and benchmark evaluations as you think about putting it into a production environment.\n", + "\n", + "In this notebook, you will run evaluators to test an agent. You will do so in a few steps:\n", + "\n", + "1. Create a dataset\n", + "2. Select or create evaluators to measure performance\n", + "3. Define the LLM or Chain initializer to test\n", + "4. Run the chain and evaluators using the helper functions" + ] + }, + { + "cell_type": "markdown", + "id": "beab1a29-b79d-4a99-b5b1-0870c2d772b1", + "metadata": {}, + "source": [ + "### 1. Create Dataset\n", + "\n", + "Below, use the client to create a dataset from the Agent runs you just logged while debugging above. You will use these later to measure performance.\n", + "\n", + "For more information on datasets, including how to create them from CSVs or other files or how to create them in the web app, please refer to the [LangSmith documentation](https://docs.langchain.plus/docs)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d14a9881-2a01-404c-8c56-0b78565c3ff4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dataset_name = \"calculator-example-dataset\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "if dataset_name in set([dataset.name for dataset in client.list_datasets()]):\n", + " client.delete_dataset(dataset_name=dataset_name)\n", + "dataset = client.create_dataset(\n", + " dataset_name, description=\"A calculator example dataset\"\n", + ")\n", + "\n", + "runs = client.list_runs(\n", + " project_name=os.environ[\"LANGCHAIN_PROJECT\"],\n", + " execution_order=1, # Only return the top-level runs\n", + " error=False, # Only runs that succeed\n", + ")\n", + "for run in runs:\n", + " client.create_example(\n", + " inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "92e8944f-e6fc-4bdf-9611-b2db39698cbe", + "metadata": {}, + "source": [ + "### 2. Select RunEvaluators\n", + "\n", + "Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n", + "It's easier to leverage AI-assisted feedback to evaluate your agent's performance.\n", + "\n", + "Below, we will create some pre-implemented run evaluators that do the following:\n", + "- Compare results against ground truth labels. (You used the debug outputs above for this)\n", + "- Evaluate the overall agent trajectory based on the tool usage and intermediate steps.\n", + "- Evaluating 'aspects' of the agent's response in a reference-free manner using custom criteria\n", + "- Evaluating performance based on 'context' such as retrieved documents or tool results.\n", + "\n", + "For a longer discussion of how to select an appropriate evaluator for your use case and how to create your own\n", + "custom evaluators, please refer to the [LangSmith documentation](https://docs.langchain.plus/docs/).\n", + "\n", + "Below, create the run evaluators.\n", + "\n", + "**Note: the feedback API is currently experimental and subject to change.**" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "56298faa-9ff2-43a2-b35a-ee306e3bf64d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.evaluation.run_evaluators import (\n", + " get_qa_evaluator,\n", + " get_criteria_evaluator,\n", + " get_trajectory_evaluator,\n", + ")\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "# You can use any model, but stronger llms tend to be more reliable\n", + "eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n", + "\n", + "# Measures accuracy against ground truth\n", + "qa_evaluator = get_qa_evaluator(eval_llm) \n", + "\n", + "# Measures how effective and efficient the agent's actions are\n", + "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", + "trajectory_evaluator = get_trajectory_evaluator(eval_llm, agent_tools=tools)\n", + "\n", + "# Measure helpfulness. 
We have some pre-defined criteria you can select\n", + "helpfulness_evaluator = get_criteria_evaluator(\n", + " eval_llm,\n", + " \"helpfulness\",\n", + ")\n", + "\n", + "# Custom criteria are specified as a dictionary\n", + "custom_criteria_evaluator = get_criteria_evaluator(\n", + " eval_llm,\n", + " {\n", + " \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n", + " },\n", + ")\n", + "\n", + "evaluators = [\n", + " qa_evaluator,\n", + " trajectory_evaluator,\n", + " helpfulness_evaluator,\n", + " custom_criteria_evaluator,\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "8adfd29c-b258-49e5-94b4-74597a12ba16", + "metadata": { + "tags": [] + }, + "source": [ + "### 3. Define the Agent or LLM to Test\n", + "\n", + "You can evaluate any LLM or chain. Since chains can have memory, we need to pass an\n", + "initializer function that returns a new chain for each row.\n", + "\n", + "In this case, you will test an agent that uses OpenAI's function calling endpoints, but it can be any simple chain." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f42d8ecc-d46a-448b-a89c-04b0f6907f75", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.agents import initialize_agent, load_tools\n", + "from langchain.agents import AgentType\n", + "\n", + "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n", + "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", + "\n", + "# Since chains can be stateful (e.g. they can have memory), we need provide\n", + "# a way to initialize a new chain for each row in the dataset. This is done\n", + "# by passing in a factory function that returns a new chain for each row.\n", + "def agent_factory():\n", + " return initialize_agent(\n", + " tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n", + ")\n", + "\n", + "# If your chain is NOT stateful, your factory can return the object directly\n", + "# to improve runtime performance. For example:\n", + "# chain_factory = lambda: agent" + ] + }, + { + "cell_type": "markdown", + "id": "07885b10", + "metadata": { + "tags": [] + }, + "source": [ + "### 4. Run the Agent and Evaluators\n", + "\n", + "With the dataset, agent, and evaluators selected, you can use the helper function below to run them all.\n", + "\n", + "The run traces and evaluation feedback will automatically be associated with the dataset for easy attribution and analysis." 
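
A sketch of what that call might look like, using the dataset, agent factory, evaluators, and client defined in the cells above (the notebook's own invocation may differ slightly):

```python
from langchain.client import arun_on_dataset

chain_results = await arun_on_dataset(
    dataset_name=dataset_name,           # created in step 1
    llm_or_chain_factory=agent_factory,  # a fresh agent per example (step 3)
    run_evaluators=evaluators,           # feedback is logged alongside each run (step 2)
    concurrency_level=5,
    verbose=True,
    client=client,
    tags=["testing-notebook"],
)
```
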
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3733269b-8085-4644-9d5d-baedcff13a2f", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 15, - "id": "35db4025-9183-4e5f-ba14-0b1b380f49c7", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.evaluation.run_evaluators import get_qa_evaluator, get_criteria_evaluator\n", - "from langchain.chat_models import ChatOpenAI\n", - "\n", - "eval_llm = ChatOpenAI(temperature=0)\n", - "\n", - "qa_evaluator = get_qa_evaluator(eval_llm)\n", - "helpfulness_evaluator = get_criteria_evaluator(eval_llm, \"helpfulness\")\n", - "conciseness_evaluator = get_criteria_evaluator(eval_llm, \"conciseness\")\n", - "custom_criteria_evaluator = get_criteria_evaluator(\n", - " eval_llm,\n", - " {\n", - " \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n", - " },\n", - ")\n", - "\n", - "evaluators = [\n", - " qa_evaluator,\n", - " helpfulness_evaluator,\n", - " conciseness_evaluator,\n", - " custom_criteria_evaluator,\n", - "]" + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m\n", + "\u001b[0marun_on_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mllm_or_chain_factory\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'MODEL_OR_CHAIN_FACTORY'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mconcurrency_level\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mnum_repetitions\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mproject_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[str]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[LangChainPlusClient]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[List[str]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mrun_evaluators\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[Sequence[RunEvaluator]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'Dict[str, Any]'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m\n", + "Asynchronously run the Chain or language model on a dataset\n", + "and store traces to the specified project name.\n", + "\n", + "Args:\n", + " dataset_name: Name of the dataset to run 
the chain on.\n", + " llm_or_chain_factory: Language model or Chain constructor to run\n", + " over the dataset. The Chain constructor is used to permit\n", + " independent calls on each example without carrying over state.\n", + " concurrency_level: The number of async tasks to run concurrently.\n", + " num_repetitions: Number of times to run the model on each example.\n", + " This is useful when testing success rates or generating confidence\n", + " intervals.\n", + " project_name: Name of the project to store the traces in.\n", + " Defaults to {dataset_name}-{chain class name}-{datetime}.\n", + " verbose: Whether to print progress.\n", + " client: Client to use to read the dataset. If not provided, a new\n", + " client will be created using the credentials in the environment.\n", + " tags: Tags to add to each run in the project.\n", + " run_evaluators: Evaluators to run on the results of the chain.\n", + "\n", + "Returns:\n", + " A dictionary containing the run's project name and the resulting model outputs.\n", + "\u001b[0;31mFile:\u001b[0m ~/code/lc/lckg/langchain/client/runner_utils.py\n", + "\u001b[0;31mType:\u001b[0m function" ] - }, + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain.client import (\n", + " arun_on_dataset,\n", + " run_on_dataset, # Available if your chain doesn't support async calls.\n", + ")\n", + "\n", + "?arun_on_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 16, - "id": "4c94a738-dcd3-442e-b8e7-dd36459f56e3", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5fce1ce42a8c4110b7d12443948ac697", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "0it [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from tqdm.notebook import tqdm\n", - "\n", - "feedbacks = []\n", - "runs = client.list_runs(project_name=chain_results[\"project_name\"], execution_order=1, error=False)\n", - "for run in tqdm(runs):\n", - " if run.outputs is None:\n", - " continue\n", - " eval_feedback = []\n", - " for evaluator in evaluators:\n", - " eval_feedback.append(client.aevaluate_run(run, evaluator))\n", - " feedbacks.extend(await asyncio.gather(*eval_feedback))" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Processed examples: 6\r" + ] + } + ], + "source": [ + "chain_results = await arun_on_dataset(\n", + " dataset_name=dataset_name,\n", + " llm_or_chain_factory=agent_factory,\n", + " concurrency_level=5, # Optional, sets the number of examples to run at a time\n", + " verbose=True,\n", + " client=client,\n", + " tags=[\n", + " \"testing-notebook\",\n", + " ], # Optional, adds a tag to the resulting chain runs\n", + " run_evaluators=evaluators,\n", + ")\n", + "\n", + "# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n", + "# These are logged as warnings here and captured as errors in the tracing UI." 
+ ] + }, + { + "cell_type": "markdown", + "id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4", + "metadata": { + "tags": [] + }, + "source": [ + "### Review the Test Results\n", + "\n", + "You can review the test results in the tracing UI below by navigating to the Testing project \n", + "with the title that starts with **\"calculator-example-dataset-AgentExecutor-\"**.\n", + "\n", + "This will show the new runs and the feedback logged from the selected evaluators." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "136db492-d6ca-4215-96f9-439c23538241", + "metadata": { + "tags": [] + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 17, - "id": "8696f167-dc75-4ef8-8bb3-ac1ce8324f30", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "LangChain+ Client" - ], - "text/plain": [ - "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } ], - "source": [ - "client" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5037e54-2c5a-4993-9b46-2a98773d3079", - "metadata": {}, - "outputs": [], - "source": [] } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" + "data": { + "text/html": [ + "LangChain+ Client" ], + "text/plain": [ + "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "# You can navigate to the UI by clicking on the link below\n", + "client" + ] + }, + { + "cell_type": "markdown", + "id": "5f2c0539-09c1-42f9-a2ee-6a88a378d479", + "metadata": { + "tags": [] + }, + "source": [ + "For a real production application, you will want to add many more test cases and\n", + "incorporate larger datasets to run benchmark evaluations that measure aggregate performance\n", + "across your use cases. For more information on recommended ways to do this, see the [LangSmith documentation](https://docs.langchain.plus/docs/)." + ] + }, + { + "cell_type": "markdown", + "id": "cd67201c-8dc1-4689-981c-759800749e25", + "metadata": {}, + "source": [ + "## Monitor\n", + "\n", + "Once your agent has passed the selected quality bar, you can deploy it to production. 
For this notebook, you will simulate user interactions directly while logging your traces to LangSmith for monitoring.\n", + "\n", + "For more information on real production deployments, check out the [LangChain documentation](https://python.langchain.com/docs/guides/deployments/) or contact us at [support@langchain.dev](mailto:support@langchain.dev).\n", + "\n", + "**First, create a new project to use in your production deployment.**" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "3718710f-f719-4861-a351-0bb9d639d9fd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "deployment_name = f\"Search + Calculator Deployment - {unique_id}\"\n", + "project = client.create_project(deployment_name, mode=\"monitor\")" + ] + }, + { + "cell_type": "markdown", + "id": "3a993ae7-6d26-495a-8633-64936bf94127", + "metadata": { + "tags": [] + }, + "source": [ + "**Then, deploy your agent to production, making sure to configure the environment to log to the monitoring project.**" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "56dba20a-c07c-4b18-a4e7-834ab6dc87ef", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "agent = initialize_agent(\n", + " tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "569389d4-b613-47ce-99d3-e0031f308185", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LLMMathChain._evaluate(\"\n", + "US_GDP / average_lifespan\n", + "\") raised error: 'US_GDP'. Please try again with a valid numerical expression\n" + ] } + ], + "source": [ + "os.environ[\"LANGCHAIN_PROJECT\"] = deployment_name\n", + "\n", + "inputs = [\n", + " \"What's the ratio of the current US GDP to the average lifespan of a human?\",\n", + " \"What's sin of 180 degrees?\",\n", + " \"I need help on my homework\",\n", + " \"If the price of a bushel of wheat increases by 10 cents, about how much will that impact the average cost of bread?\",\n", + " # etc.\n", + "]\n", + "for query in inputs:\n", + " try:\n", + " await agent.arun(query)\n", + " except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "2646f0fb-81d4-43ce-8a9b-54b8e19841e2", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "Congratulations! You have successfully connected an agent to LangSmith to trace and debug it, evaluated it for accuracy, helpfulness, and trajectory efficiency over a dataset, and instrumented a monitoring project for a simulated \"production\" application!\n", + "\n", + "This was a quick guide to get started, but there are many more ways to use LangSmith to speed up your development workflow and produce better products.\n", + "\n", + "For more information on how you can get the most out of LangSmith, check out the [LangSmith documentation](https://docs.langchain.plus/docs/).\n", + "\n", + "Please reach out with questions, feature requests, or feedback at [support@langchain.dev](mailto:support@langchain.dev)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90b7fbff-162d-4c9c-b6fc-33bd5445745f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 5 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/tests/unit_tests/client/test_runner_utils.py b/tests/unit_tests/client/test_runner_utils.py index 162418f321..bada74b79c 100644 --- a/tests/unit_tests/client/test_runner_utils.py +++ b/tests/unit_tests/client/test_runner_utils.py @@ -169,8 +169,8 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None: example: Example, llm_or_chain: Union[BaseLanguageModel, Chain], n_repetitions: int, - tracer: Any, tags: Optional[List[str]] = None, + callbacks: Optional[Any] = None, ) -> List[Dict[str, Any]]: return [ {"result": f"Result for example {example.id}"} for _ in range(n_repetitions)