From b4fe7f3a0995cc6a0111a7e71347eddf2d61f132 Mon Sep 17 00:00:00 2001 From: Zander Chase <130414180+vowelparrot@users.noreply.github.com> Date: Fri, 23 Jun 2023 01:11:01 -0700 Subject: [PATCH] Session to project (#6249) Sessions are being renamed to projects in the tracer --- langchain/callbacks/manager.py | 22 +- langchain/callbacks/tracers/langchain.py | 8 +- langchain/client/runner_utils.py | 64 +- .../client/tracing_datasets.ipynb | 1296 +++++++++-------- poetry.lock | 10 +- pyproject.toml | 2 +- tests/unit_tests/client/test_runner_utils.py | 6 +- 7 files changed, 708 insertions(+), 700 deletions(-) diff --git a/langchain/callbacks/manager.py b/langchain/callbacks/manager.py index 1f73c029c0..3e899234e6 100644 --- a/langchain/callbacks/manager.py +++ b/langchain/callbacks/manager.py @@ -106,7 +106,7 @@ def wandb_tracing_enabled( @contextmanager def tracing_v2_enabled( - session_name: Optional[str] = None, + project_name: Optional[str] = None, *, example_id: Optional[Union[str, UUID]] = None, ) -> Generator[None, None, None]: @@ -120,7 +120,7 @@ def tracing_v2_enabled( example_id = UUID(example_id) cb = LangChainTracer( example_id=example_id, - session_name=session_name, + project_name=project_name, ) tracing_v2_callback_var.set(cb) yield @@ -131,12 +131,12 @@ def tracing_v2_enabled( def trace_as_chain_group( group_name: str, *, - session_name: Optional[str] = None, + project_name: Optional[str] = None, example_id: Optional[Union[str, UUID]] = None, ) -> Generator[CallbackManager, None, None]: """Get a callback manager for a chain group in a context manager.""" cb = LangChainTracer( - session_name=session_name, + project_name=project_name, example_id=example_id, ) cm = CallbackManager.configure( @@ -152,12 +152,12 @@ def trace_as_chain_group( async def atrace_as_chain_group( group_name: str, *, - session_name: Optional[str] = None, + project_name: Optional[str] = None, example_id: Optional[Union[str, UUID]] = None, ) -> AsyncGenerator[AsyncCallbackManager, 
None]: """Get a callback manager for a chain group in a context manager.""" cb = LangChainTracer( - session_name=session_name, + project_name=project_name, example_id=example_id, ) cm = AsyncCallbackManager.configure( @@ -1039,10 +1039,10 @@ def _configure( tracing_v2_enabled_ = ( env_var_is_set("LANGCHAIN_TRACING_V2") or tracer_v2 is not None ) - tracer_session = os.environ.get("LANGCHAIN_SESSION") + tracer_project = os.environ.get( + "LANGCHAIN_PROJECT", os.environ.get("LANGCHAIN_SESSION", "default") + ) debug = _get_debug() - if tracer_session is None: - tracer_session = "default" if ( verbose or debug @@ -1072,7 +1072,7 @@ def _configure( callback_manager.add_handler(tracer, True) else: handler = LangChainTracerV1() - handler.load_session(tracer_session) + handler.load_session(tracer_project) callback_manager.add_handler(handler, True) if wandb_tracing_enabled_ and not any( isinstance(handler, WandbTracer) for handler in callback_manager.handlers @@ -1090,7 +1090,7 @@ def _configure( callback_manager.add_handler(tracer_v2, True) else: try: - handler = LangChainTracer(session_name=tracer_session) + handler = LangChainTracer(project_name=tracer_project) callback_manager.add_handler(handler, True) except Exception as e: logger.warning( diff --git a/langchain/callbacks/tracers/langchain.py b/langchain/callbacks/tracers/langchain.py index 48d6164c38..394857b570 100644 --- a/langchain/callbacks/tracers/langchain.py +++ b/langchain/callbacks/tracers/langchain.py @@ -45,7 +45,7 @@ class LangChainTracer(BaseTracer): def __init__( self, example_id: Optional[Union[UUID, str]] = None, - session_name: Optional[str] = None, + project_name: Optional[str] = None, client: Optional[LangChainPlusClient] = None, **kwargs: Any, ) -> None: @@ -55,7 +55,9 @@ class LangChainTracer(BaseTracer): self.example_id = ( UUID(example_id) if isinstance(example_id, str) else example_id ) - self.session_name = session_name or os.getenv("LANGCHAIN_SESSION", "default") + self.project_name = 
project_name or os.getenv( + "LANGCHAIN_PROJECT", os.getenv("LANGCHAIN_SESSION", "default") + ) # set max_workers to 1 to process tasks in order self.executor = ThreadPoolExecutor(max_workers=1) self.client = client or LangChainPlusClient() @@ -103,7 +105,7 @@ class LangChainTracer(BaseTracer): extra["runtime"] = get_runtime_environment() run_dict["extra"] = extra try: - self.client.create_run(**run_dict, session_name=self.session_name) + self.client.create_run(**run_dict, project_name=self.project_name) except Exception as e: # Errors are swallowed by the thread executor so we need to log them here log_error_once("post", e) diff --git a/langchain/client/runner_utils.py b/langchain/client/runner_utils.py index 5a7d6d42b6..a2b58aa6a3 100644 --- a/langchain/client/runner_utils.py +++ b/langchain/client/runner_utils.py @@ -237,18 +237,18 @@ async def _gather_with_concurrency( return results -async def _tracer_initializer(session_name: Optional[str]) -> Optional[LangChainTracer]: +async def _tracer_initializer(project_name: Optional[str]) -> Optional[LangChainTracer]: """ Initialize a tracer to share across tasks. Args: - session_name: The session name for the tracer. + project_name: The project name for the tracer. Returns: - A LangChainTracer instance with an active session. + A LangChainTracer instance with an active project. """ - if session_name: - tracer = LangChainTracer(session_name=session_name) + if project_name: + tracer = LangChainTracer(project_name=project_name) return tracer else: return None @@ -260,12 +260,12 @@ async def arun_on_examples( *, concurrency_level: int = 5, num_repetitions: int = 1, - session_name: Optional[str] = None, + project_name: Optional[str] = None, verbose: bool = False, tags: Optional[List[str]] = None, ) -> Dict[str, Any]: """ - Run the chain on examples and store traces to the specified session name. + Run the chain on examples and store traces to the specified project name. 
Args: examples: Examples to run the model or chain over @@ -276,7 +276,7 @@ async def arun_on_examples( num_repetitions: Number of times to run the model on each example. This is useful when testing success rates or generating confidence intervals. - session_name: Session name to use when tracing runs. + project_name: Project name to use when tracing runs. verbose: Whether to print progress. tags: Tags to add to the traces. @@ -307,7 +307,7 @@ async def arun_on_examples( await _gather_with_concurrency( concurrency_level, - functools.partial(_tracer_initializer, session_name), + functools.partial(_tracer_initializer, project_name), *(functools.partial(process_example, e) for e in examples), ) return results @@ -386,11 +386,11 @@ def run_on_examples( llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, *, num_repetitions: int = 1, - session_name: Optional[str] = None, + project_name: Optional[str] = None, verbose: bool = False, tags: Optional[List[str]] = None, ) -> Dict[str, Any]: - """Run the chain on examples and store traces to the specified session name. + """Run the chain on examples and store traces to the specified project name. Args: examples: Examples to run model or chain over. @@ -401,14 +401,14 @@ def run_on_examples( num_repetitions: Number of times to run the model on each example. This is useful when testing success rates or generating confidence intervals. - session_name: Session name to use when tracing runs. + project_name: Project name to use when tracing runs. verbose: Whether to print progress. tags: Tags to add to the run traces. Returns: A dictionary mapping example ids to the model outputs. 
""" results: Dict[str, Any] = {} - tracer = LangChainTracer(session_name=session_name) if session_name else None + tracer = LangChainTracer(project_name=project_name) if project_name else None for i, example in enumerate(examples): result = run_llm_or_chain( example, @@ -425,13 +425,13 @@ def run_on_examples( return results -def _get_session_name( - session_name: Optional[str], +def _get_project_name( + project_name: Optional[str], llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, dataset_name: str, ) -> str: - if session_name is not None: - return session_name + if project_name is not None: + return project_name current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") if isinstance(llm_or_chain_factory, BaseLanguageModel): model_name = llm_or_chain_factory.__class__.__name__ @@ -446,13 +446,13 @@ async def arun_on_dataset( *, concurrency_level: int = 5, num_repetitions: int = 1, - session_name: Optional[str] = None, + project_name: Optional[str] = None, verbose: bool = False, client: Optional[LangChainPlusClient] = None, tags: Optional[List[str]] = None, ) -> Dict[str, Any]: """ - Run the chain on a dataset and store traces to the specified session name. + Run the chain on a dataset and store traces to the specified project name. Args: client: Client to use to read the dataset. @@ -464,7 +464,7 @@ async def arun_on_dataset( num_repetitions: Number of times to run the model on each example. This is useful when testing success rates or generating confidence intervals. - session_name: Name of the session to store the traces in. + project_name: Name of the project to store the traces in. Defaults to {dataset_name}-{chain class name}-{datetime}. verbose: Whether to print progress. client: Client to use to read the dataset. If not provided, a new @@ -472,11 +472,10 @@ async def arun_on_dataset( tags: Tags to add to each run in the sesssion. Returns: - A dictionary containing the run's session name and the resulting model outputs. 
+ A dictionary containing the run's project name and the resulting model outputs. """ client_ = client or LangChainPlusClient() - session_name = _get_session_name(session_name, llm_or_chain_factory, dataset_name) - client_.create_session(session_name, mode="eval") + project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name) dataset = client_.read_dataset(dataset_name=dataset_name) examples = client_.list_examples(dataset_id=str(dataset.id)) @@ -485,12 +484,12 @@ async def arun_on_dataset( llm_or_chain_factory, concurrency_level=concurrency_level, num_repetitions=num_repetitions, - session_name=session_name, + project_name=project_name, verbose=verbose, tags=tags, ) return { - "session_name": session_name, + "project_name": project_name, "results": results, } @@ -500,12 +499,12 @@ def run_on_dataset( llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, *, num_repetitions: int = 1, - session_name: Optional[str] = None, + project_name: Optional[str] = None, verbose: bool = False, client: Optional[LangChainPlusClient] = None, tags: Optional[List[str]] = None, ) -> Dict[str, Any]: - """Run the chain on a dataset and store traces to the specified session name. + """Run the chain on a dataset and store traces to the specified project name. Args: dataset_name: Name of the dataset to run the chain on. @@ -516,7 +515,7 @@ def run_on_dataset( num_repetitions: Number of times to run the model on each example. This is useful when testing success rates or generating confidence intervals. - session_name: Name of the session to store the traces in. + project_name: Name of the project to store the traces in. Defaults to {dataset_name}-{chain class name}-{datetime}. verbose: Whether to print progress. client: Client to use to access the dataset. If None, a new client @@ -524,22 +523,21 @@ def run_on_dataset( tags: Tags to add to each run in the sesssion. Returns: - A dictionary containing the run's session name and the resulting model outputs. 
+ A dictionary containing the run's project name and the resulting model outputs. """ client_ = client or LangChainPlusClient() - session_name = _get_session_name(session_name, llm_or_chain_factory, dataset_name) - client_.create_session(session_name, mode="eval") + project_name = _get_project_name(project_name, llm_or_chain_factory, dataset_name) dataset = client_.read_dataset(dataset_name=dataset_name) examples = client_.list_examples(dataset_id=str(dataset.id)) results = run_on_examples( examples, llm_or_chain_factory, num_repetitions=num_repetitions, - session_name=session_name, + project_name=project_name, verbose=verbose, tags=tags, ) return { - "session_name": session_name, + "project_name": project_name, "results": results, } diff --git a/langchain/experimental/client/tracing_datasets.ipynb b/langchain/experimental/client/tracing_datasets.ipynb index 4c5025b716..f3929c34f6 100644 --- a/langchain/experimental/client/tracing_datasets.ipynb +++ b/langchain/experimental/client/tracing_datasets.ipynb @@ -1,669 +1,677 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "1a4596ea-a631-416d-a2a4-3577c140493d", - "metadata": { - "tags": [] - }, - "source": [ - "# Tracing and Datasets with LangChainPlus\n", - "\n", - "LangChain makes it easy to get started with Agents and other LLM applications. However, it can be tricky to get right, especially when you need to deliver a full product. To speed up your application development process, and to help monitor your applications in production, LangChain offers additional tracing and tooling.\n", - "\n", - "When might you want to use tracing? 
Some situations we've found it useful include:\n", - "- Quickly debugging a new chain, agent, or set of tools\n", - "- Evaluating a given chain across different LLMs or Chat Models to compare results or improve prompts\n", - "- Running a given chain multiple time on a dataset to ensure it consistently meets a quality bar.\n", - "\n", - "\n", - "In this notebook, we'll show how to enable tracing in your LangChain applications and walk you a couple common ways to evaluate your agents.\n", - "We'll focus on using Datasets to benchmark Chain behavior.\n", - "\n", - "**Bear in mind that this notebook is designed under the assumption that you're running the latest LangChain+ server locally in the background. This is done using the folowing command in your terminal:**\n", - "\n", - "\n", - "```\n", - "pip install --upgrade langchain\n", - "langchain plus start\n", - "```\n", - "\n", - "We also have a hosted version which is in private beta. We will share more details as it progresses.\n", - "\n", - "Now, let's get started by creating a client to connect to LangChain+." - ] - }, - { - "cell_type": "markdown", - "id": "2d77d064-41b4-41fb-82e6-2d16461269ec", - "metadata": { - "tags": [] - }, - "source": [ - "## Setting up Tracing\n", - "\n", - "The V2 tracing API can be activated by setting the `LANGCHAIN_TRACING_V2` environment variable to true. Assuming you've successfully initiated the server as described earlier, running LangChain Agents, Chains, LLMs, and other primitives will automatically start capturing traces. 
Let's begin our exploration with a straightforward math example.\n", - "\n", - "**NOTE**: You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n" - ] - }, - { - "cell_type": "markdown", - "id": "7935e832-9ae1-4557-8d08-890c425f18e2", - "metadata": {}, - "source": [ - "**NOTE:** You can also use the `tracing_v2_enabled` context manager to capture sessions within a given context:\n", - "```\n", - "from langchain.callbacks.manager import tracing_v2_enabled\n", - "with tracing_v2_enabled(\"My Session Name\"):\n", - " ...\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "221b638a-2ae4-46ef-bf6a-d59bf85d587f", - "metadata": {}, - "source": [ - "**NOTE:** You can optionally set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables if using the hosted version which is in private beta." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "904db9a5-f387-4a57-914c-c8af8d39e249", - "metadata": { - "tags": [] - }, - "outputs": [ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "1a4596ea-a631-416d-a2a4-3577c140493d", + "metadata": { + "tags": [] + }, + "source": [ + "# Tracing and Datasets with LangChainPlus\n", + "\n", + "LangChain makes it easy to get started with Agents and other LLM applications. However, it can be tricky to get right, especially when you need to deliver a full product. To speed up your application development process, and to help monitor your applications in production, LangChain offers additional tracing and tooling.\n", + "\n", + "When might you want to use tracing? 
Some situations we've found it useful include:\n", + "- Quickly debugging a new chain, agent, or set of tools\n", + "- Evaluating a given chain across different LLMs or Chat Models to compare results or improve prompts\n", + "- Running a given chain multiple times on a dataset to ensure it consistently meets a quality bar.\n", + "\n", + "\n", + "In this notebook, we'll show how to enable tracing in your LangChain applications and walk you through a couple of common ways to evaluate your agents.\n", + "We'll focus on using Datasets to benchmark Chain behavior.\n", + "\n", + "**Bear in mind that this notebook is designed under the assumption that you're running the latest LangChain+ server locally in the background. This is done using the following command in your terminal:**\n", + "\n", + "\n", + "```\n", + "pip install --upgrade langchain\n", + "langchain plus start\n", + "```\n", + "\n", + "We also have a hosted version which is in private beta. We will share more details as it progresses.\n", + "\n", + "Now, let's get started by creating a client to connect to LangChain+." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2d77d064-41b4-41fb-82e6-2d16461269ec", + "metadata": { + "tags": [] + }, + "source": [ + "## Setting up Tracing\n", + "\n", + "The V2 tracing API can be activated by setting the `LANGCHAIN_TRACING_V2` environment variable to true. Assuming you've successfully initiated the server as described earlier, running LangChain Agents, Chains, LLMs, and other primitives will automatically start capturing traces.
Let's begin our exploration with a straightforward math example.\n", + "\n", + "**NOTE**: You must also set your `OPENAI_API_KEY` and `SERPAPI_API_KEY` environment variables in order to run the following tutorial.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7935e832-9ae1-4557-8d08-890c425f18e2", + "metadata": {}, + "source": [ + "**NOTE:** You can also use the `tracing_v2_enabled` context manager to capture projects within a given context:\n", + "```\n", + "from langchain.callbacks.manager import tracing_v2_enabled\n", + "with tracing_v2_enabled(\"My Project Name\"):\n", + " ...\n", + "```" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "You can click the link below to view the UI\n" - ] + "attachments": {}, + "cell_type": "markdown", + "id": "221b638a-2ae4-46ef-bf6a-d59bf85d587f", + "metadata": {}, + "source": [ + "**NOTE:** You can optionally set the `LANGCHAIN_ENDPOINT` and `LANGCHAIN_API_KEY` environment variables if using the hosted version which is in private beta." 
+ ] }, { - "data": { - "text/html": [ - "LangChain+ Client" + "cell_type": "code", + "execution_count": 1, + "id": "904db9a5-f387-4a57-914c-c8af8d39e249", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You can click the link below to view the UI\n" + ] + }, + { + "data": { + "text/html": [ + "LangChain+ Client" + ], + "text/plain": [ + "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" + "source": [ + "import os\n", + "from langchainplus_sdk import LangChainPlusClient\n", + "\n", + "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", + "os.environ[\"LANGCHAIN_PROJECT\"] = \"Tracing Walkthrough\"\n", + "# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\" # Uncomment this line if you want to use the hosted version\n", + "# os.environ[\"LANGCHAIN_API_KEY\"] = \"\" # Uncomment this line if you want to use the hosted version.\n", + "\n", + "client = LangChainPlusClient()\n", + "print(\"You can click the link below to view the UI\")\n", + "client" ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import os\n", - "from langchainplus_sdk import LangChainPlusClient\n", - "\n", - "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", - "os.environ[\"LANGCHAIN_SESSION\"] = \"Tracing Walkthrough\"\n", - "# os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\" # Uncomment this line if you want to use the hosted version\n", - "# os.environ[\"LANGCHAIN_API_KEY\"] = \"\" # Uncomment this line if you want to use the hosted version.\n", - "\n", - "client = LangChainPlusClient()\n", - "print(\"You can click the link below to view the UI\")\n", - "client" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": 
"7c801853-8e96-404d-984c-51ace59cbbef", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.agents import initialize_agent, load_tools\n", - "from langchain.agents import AgentType\n", - "\n", - "llm = ChatOpenAI(temperature=0)\n", - "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", - "agent = initialize_agent(\n", - " tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "19537902-b95c-4390-80a4-f6c9a937081e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import asyncio\n", - "\n", - "inputs = [\n", - " \"How many people live in canada as of 2023?\",\n", - " \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n", - " \"what is dua lipa's boyfriend age raised to the .43 power?\",\n", - " \"how far is it from paris to boston in miles\",\n", - " \"what was the total number of points scored in the 2023 super bowl? what is that number raised to the .23 power?\",\n", - " \"what was the total number of points scored in the 2023 super bowl raised to the .23 power?\",\n", - " \"how many more points were scored in the 2023 super bowl than in the 2022 super bowl?\",\n", - " \"what is 153 raised to .1312 power?\",\n", - " \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\",\n", - " \"what is 1213 divided by 4345?\",\n", - "]\n", - "results = []\n", - "\n", - "\n", - "async def arun(agent, input_example):\n", - " try:\n", - " return await agent.arun(input_example)\n", - " except Exception as e:\n", - " # The agent sometimes makes mistakes! 
These will be captured by the tracing.\n", - " return e\n", - "\n", - "\n", - "for input_example in inputs:\n", - " results.append(arun(agent, input_example))\n", - "results = await asyncio.gather(*results)" - ] - }, - { - "cell_type": "markdown", - "id": "6c43c311-4e09-4d57-9ef3-13afb96ff430", - "metadata": {}, - "source": [ - "## Creating the Dataset\n", - "\n", - "Now that you've captured a session entitled 'Tracing Walkthrough', it's time to create a dataset. We will do so using the `create_dataset` method below." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "d14a9881-2a01-404c-8c56-0b78565c3ff4", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "dataset_name = \"calculator-example-dataset\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "if dataset_name in set([dataset.name for dataset in client.list_datasets()]):\n", - " client.delete_dataset(dataset_name=dataset_name)\n", - "dataset = client.create_dataset(\n", - " dataset_name, description=\"A calculator example dataset\"\n", - ")\n", - "runs = client.list_runs(\n", - " session_name=os.environ[\"LANGCHAIN_SESSION\"],\n", - " execution_order=1, # Only return the top-level runs\n", - " error=False, # Only runs that succeed\n", - ")\n", - "for run in runs:\n", - " if run.outputs is None:\n", - " continue\n", - " try:\n", - " client.create_example(\n", - " inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id\n", - " )\n", - " except:\n", - " pass" - ] - }, - { - "cell_type": "markdown", - "id": "db79dea2-fbaa-4c12-9083-f6154b51e2d3", - "metadata": { - "jp-MarkdownHeadingCollapsed": true, - "tags": [] - }, - "source": [ - "**Alternative: Creating a Dataset in the UI** \n", - "\n", - "Alternatively, you could create or edit the dataset in the UI using the following steps:\n", - "\n", - " 1. 
Navigate to the UI by clicking on the link below.\n", - " 2. Select the 'search_and_math_chain' session from the list.\n", - " 3. Next to the fist example, click \"+ to Dataset\".\n", - " 4. Click \"Create Dataset\" and create a title **\"calculator-example-dataset\"**.\n", - " 5. Add the other examples to the dataset as well\n", - "\n", - "Once you've used LangChain+ for a while, you will have a number of datasets to work with. To view all saved datasets, execute the following code:\n", - "\n", - "```\n", - "datasets = client.list_datasets()\n", - "print(datasets)\n", - "```\n", - "\n", - "\n", - "**Optional:** If you didn't run the trace above, you can also create datasets by uploading dataframes or CSV files." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1baa677c-5642-4378-8e01-3aa1647f19d6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# !pip install datasets > /dev/null\n", - "# !pip install pandas > /dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "60d14593-c61f-449f-a38f-772ca43707c2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# import pandas as pd\n", - "# from langchain.evaluation.loading import load_dataset\n", - "\n", - "# dataset = load_dataset(\"agent-search-calculator\")\n", - "# df = pd.DataFrame(dataset, columns=[\"question\", \"answer\"])\n", - "# df.columns = [\"input\", \"output\"] # The chain we want to evaluate below expects inputs with the \"input\" key\n", - "# df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "52a7ea76-79ca-4765-abf7-231e884040d6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# dataset_name = \"calculator-example-dataset\"\n", - "\n", - "# if dataset_name not in set([dataset.name for dataset in client.list_datasets()]):\n", - "# dataset = client.upload_dataframe(df,\n", - "# name=dataset_name,\n", - "# description=\"A calculator example dataset\",\n", - "# 
input_keys=[\"input\"],\n", - "# output_keys=[\"output\"],\n", - "# )" - ] - }, - { - "cell_type": "markdown", - "id": "07885b10", - "metadata": { - "tags": [] - }, - "source": [ - "## Running a Chain on a Traced Dataset\n", - "\n", - "Once you have a dataset, you can run a compatible chain or other object over it to see its results. The run traces will automatically be associated with the dataset for easy attribution and analysis.\n", - "\n", - "**First, we'll define the chain we wish to run over the dataset.**\n", - "\n", - "In this case, we're using an agent, but it can be any simple chain." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c2b59104-b90e-466a-b7ea-c5bd0194263b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.agents import initialize_agent, load_tools\n", - "from langchain.agents import AgentType\n", - "\n", - "llm = ChatOpenAI(temperature=0)\n", - "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", - "agent = initialize_agent(\n", - " tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "84094a4a-1d76-461c-bc37-8c537939b466", - "metadata": {}, - "source": [ - "**Now we're ready to run the chain!**\n", - "\n", - "The docstring below hints ways you can configure the method to run." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "112d7bdf-7e50-4c1a-9285-5bac8473f2ee", - "metadata": { - "tags": [] - }, - "outputs": [ + }, { - "data": { - "text/plain": [ - "\u001b[0;31mSignature:\u001b[0m\n", - "\u001b[0marun_on_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mdataset_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mllm_or_chain_factory\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'MODEL_OR_CHAIN_FACTORY'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mconcurrency_level\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mnum_repetitions\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0msession_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[str]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[LangChainPlusClient]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - "\u001b[0;34m\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[List[str]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", - 
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'Dict[str, Any]'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mDocstring:\u001b[0m\n", - "Run the chain on a dataset and store traces to the specified session name.\n", - "\n", - "Args:\n", - " client: Client to use to read the dataset.\n", - " dataset_name: Name of the dataset to run the chain on.\n", - " llm_or_chain_factory: Language model or Chain constructor to run\n", - " over the dataset. The Chain constructor is used to permit\n", - " independent calls on each example without carrying over state.\n", - " concurrency_level: The number of async tasks to run concurrently.\n", - " num_repetitions: Number of times to run the model on each example.\n", - " This is useful when testing success rates or generating confidence\n", - " intervals.\n", - " session_name: Name of the session to store the traces in.\n", - " Defaults to {dataset_name}-{chain class name}-{datetime}.\n", - " verbose: Whether to print progress.\n", - " client: Client to use to read the dataset. 
If not provided, a new\n", - " client will be created using the credentials in the environment.\n", - " tags: Tags to add to each run in the sesssion.\n", - "\n", - "Returns:\n", - " A dictionary containing the run's session name and the resulting model outputs.\n", - "\u001b[0;31mFile:\u001b[0m ~/code/lc/lckg/langchain/client/runner_utils.py\n", - "\u001b[0;31mType:\u001b[0m function" + "cell_type": "code", + "execution_count": 2, + "id": "7c801853-8e96-404d-984c-51ace59cbbef", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.agents import initialize_agent, load_tools\n", + "from langchain.agents import AgentType\n", + "\n", + "llm = ChatOpenAI(temperature=0)\n", + "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", + "agent = initialize_agent(\n", + " tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n", + ")" ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain.client import arun_on_dataset\n", - "\n", - "?arun_on_dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "6e10f823", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Since chains can be stateful (e.g. they can have memory), we need provide\n", - "# a way to initialize a new chain for each row in the dataset. This is done\n", - "# by passing in a factory function that returns a new chain for each row.\n", - "chain_factory = lambda: initialize_agent(\n", - " tools,\n", - " llm,\n", - " agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n", - " verbose=False,\n", - ")\n", - "\n", - "# If your chain is NOT stateful, your lambda can return the object directly\n", - "# to improve runtime performance. 
For example:\n", - "# chain_factory = lambda: agent" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33", - "metadata": { - "tags": [] - }, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processed examples: 1\r" - ] + "cell_type": "code", + "execution_count": 3, + "id": "19537902-b95c-4390-80a4-f6c9a937081e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import asyncio\n", + "\n", + "inputs = [\n", + " \"How many people live in canada as of 2023?\",\n", + " \"who is dua lipa's boyfriend? what is his age raised to the .43 power?\",\n", + " \"what is dua lipa's boyfriend age raised to the .43 power?\",\n", + " \"how far is it from paris to boston in miles\",\n", + " \"what was the total number of points scored in the 2023 super bowl? what is that number raised to the .23 power?\",\n", + " \"what was the total number of points scored in the 2023 super bowl raised to the .23 power?\",\n", + " \"how many more points were scored in the 2023 super bowl than in the 2022 super bowl?\",\n", + " \"what is 153 raised to .1312 power?\",\n", + " \"who is kendall jenner's boyfriend? what is his height (in inches) raised to .13 power?\",\n", + " \"what is 1213 divided by 4345?\",\n", + "]\n", + "results = []\n", + "\n", + "\n", + "async def arun(agent, input_example):\n", + " try:\n", + " return await agent.arun(input_example)\n", + " except Exception as e:\n", + " # The agent sometimes makes mistakes! These will be captured by the tracing.\n", + " return e\n", + "\n", + "\n", + "for input_example in inputs:\n", + " results.append(arun(agent, input_example))\n", + "results = await asyncio.gather(*results)" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "Chain failed for example b36a82d3-4fb6-4bc4-87df-b7c355742b8e. 
Error: unknown format from LLM: Sorry, I cannot answer this question as it requires information that is not currently available.\n" - ] + "attachments": {}, + "cell_type": "markdown", + "id": "6c43c311-4e09-4d57-9ef3-13afb96ff430", + "metadata": {}, + "source": [ + "## Creating the Dataset\n", + "\n", + "Now that you've captured a project entitled 'Tracing Walkthrough', it's time to create a dataset. We will do so using the `create_dataset` method below." + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processed examples: 6\r" - ] - } - ], - "source": [ - "chain_results = await arun_on_dataset(\n", - " dataset_name=dataset_name,\n", - " llm_or_chain_factory=chain_factory,\n", - " concurrency_level=5, # Optional, sets the number of examples to run at a time\n", - " verbose=True,\n", - " client=client,\n", - " tags=[\n", - " \"testing-notebook\",\n", - " \"turbo\",\n", - " ], # Optional, adds a tag to the resulting chain runs\n", - ")\n", - "\n", - "# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n", - "# These are logged as warnings here and captured as errors in the tracing UI." 
- ] - }, - { - "cell_type": "markdown", - "id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4", - "metadata": { - "tags": [] - }, - "source": [ - "### Reviewing the Chain Results\n", - "\n", - "You can review the results of the run in the tracing UI below and navigating to the session \n", - "with the title **\"Search + Calculator Agent Evaluation\"**" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "136db492-d6ca-4215-96f9-439c23538241", - "metadata": { - "tags": [] - }, - "outputs": [ + "cell_type": "code", + "execution_count": 4, + "id": "d14a9881-2a01-404c-8c56-0b78565c3ff4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "dataset_name = \"calculator-example-dataset\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "17580c4b-bd04-4dde-9d21-9d4edd25b00d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "if dataset_name in set([dataset.name for dataset in client.list_datasets()]):\n", + " client.delete_dataset(dataset_name=dataset_name)\n", + "dataset = client.create_dataset(\n", + " dataset_name, description=\"A calculator example dataset\"\n", + ")\n", + "runs = client.list_runs(\n", + " project_name=os.environ[\"LANGCHAIN_PROJECT\"],\n", + " execution_order=1, # Only return the top-level runs\n", + " error=False, # Only runs that succeed\n", + ")\n", + "for run in runs:\n", + " if run.outputs is None:\n", + " continue\n", + " try:\n", + " client.create_example(\n", + " inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id\n", + " )\n", + " except:\n", + " pass" + ] + }, { - "data": { - "text/html": [ - "LangChain+ Client" + "attachments": {}, + "cell_type": "markdown", + "id": "db79dea2-fbaa-4c12-9083-f6154b51e2d3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "**Alternative: Creating a Dataset in the UI** \n", + "\n", + "Alternatively, you could create or edit the dataset in the UI using the following steps:\n", + "\n", + " 1. 
Navigate to the UI by clicking on the link below.\n", + " 2. Select the 'search_and_math_chain' project from the list.\n", + " 3. Next to the first example, click \"+ to Dataset\".\n", + " 4. Click \"Create Dataset\" and create a title **\"calculator-example-dataset\"**.\n", + " 5. Add the other examples to the dataset as well\n", + "\n", + "Once you've used LangChain+ for a while, you will have a number of datasets to work with. To view all saved datasets, execute the following code:\n", + "\n", + "```\n", + "datasets = client.list_datasets()\n", + "print(datasets)\n", + "```\n", + "\n", + "\n", + "**Optional:** If you didn't run the trace above, you can also create datasets by uploading dataframes or CSV files." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1baa677c-5642-4378-8e01-3aa1647f19d6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# !pip install datasets > /dev/null\n", + "# !pip install pandas > /dev/null" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "60d14593-c61f-449f-a38f-772ca43707c2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# import pandas as pd\n", + "# from langchain.evaluation.loading import load_dataset\n", + "\n", + "# dataset = load_dataset(\"agent-search-calculator\")\n", + "# df = pd.DataFrame(dataset, columns=[\"question\", \"answer\"])\n", + "# df.columns = [\"input\", \"output\"] # The chain we want to evaluate below expects inputs with the \"input\" key\n", + "# df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "52a7ea76-79ca-4765-abf7-231e884040d6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# dataset_name = \"calculator-example-dataset\"\n", + "\n", + "# if dataset_name not in set([dataset.name for dataset in client.list_datasets()]):\n", + "# dataset = client.upload_dataframe(df,\n", + "# name=dataset_name,\n", + "# description=\"A calculator example dataset\",\n", + "#
input_keys=[\"input\"],\n", + "# output_keys=[\"output\"],\n", + "# )" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "07885b10", + "metadata": { + "tags": [] + }, + "source": [ + "## Running a Chain on a Traced Dataset\n", + "\n", + "Once you have a dataset, you can run a compatible chain or other object over it to see its results. The run traces will automatically be associated with the dataset for easy attribution and analysis.\n", + "\n", + "**First, we'll define the chain we wish to run over the dataset.**\n", + "\n", + "In this case, we're using an agent, but it can be any simple chain." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c2b59104-b90e-466a-b7ea-c5bd0194263b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.agents import initialize_agent, load_tools\n", + "from langchain.agents import AgentType\n", + "\n", + "llm = ChatOpenAI(temperature=0)\n", + "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", + "agent = initialize_agent(\n", + " tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=False\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "84094a4a-1d76-461c-bc37-8c537939b466", + "metadata": {}, + "source": [ + "**Now we're ready to run the chain!**\n", + "\n", + "The docstring below hints at ways you can configure the method to run."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "112d7bdf-7e50-4c1a-9285-5bac8473f2ee", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\u001b[0;31mSignature:\u001b[0m\n", + "\u001b[0marun_on_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mdataset_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mllm_or_chain_factory\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'MODEL_OR_CHAIN_FACTORY'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mconcurrency_level\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mnum_repetitions\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mproject_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[str]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mclient\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[LangChainPlusClient]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + "\u001b[0;34m\u001b[0m \u001b[0mtags\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Optional[List[str]]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", + 
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'Dict[str, Any]'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mDocstring:\u001b[0m\n", + "Run the chain on a dataset and store traces to the specified project name.\n", + "\n", + "Args:\n", + " client: Client to use to read the dataset.\n", + " dataset_name: Name of the dataset to run the chain on.\n", + " llm_or_chain_factory: Language model or Chain constructor to run\n", + " over the dataset. The Chain constructor is used to permit\n", + " independent calls on each example without carrying over state.\n", + " concurrency_level: The number of async tasks to run concurrently.\n", + " num_repetitions: Number of times to run the model on each example.\n", + " This is useful when testing success rates or generating confidence\n", + " intervals.\n", + " project_name: Name of the project to store the traces in.\n", + " Defaults to {dataset_name}-{chain class name}-{datetime}.\n", + " verbose: Whether to print progress.\n", + " client: Client to use to read the dataset. 
If not provided, a new\n", + " client will be created using the credentials in the environment.\n", + " tags: Tags to add to each run in the sesssion.\n", + "\n", + "Returns:\n", + " A dictionary containing the run's project name and the resulting model outputs.\n", + "\u001b[0;31mFile:\u001b[0m ~/code/lc/lckg/langchain/client/runner_utils.py\n", + "\u001b[0;31mType:\u001b[0m function" + ] + }, + "metadata": {}, + "output_type": "display_data" + } ], - "text/plain": [ - "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" + "source": [ + "from langchain.client import arun_on_dataset\n", + "\n", + "?arun_on_dataset" ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# You can navigate to the UI by clicking on the link below\n", - "client" - ] - }, - { - "cell_type": "markdown", - "id": "63ed6561-6574-43b3-a653-fe410aa8a617", - "metadata": {}, - "source": [ - "## Running an Evaluation Chain\n", - "\n", - "Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n", - "It's easier to leverage AI-assisted feedback to evaluate your agent's performance.\n", - "\n", - "A few ways of doing this include:\n", - "- Adding ground-truth answers as outputs to the dataset and evaluating relative to those references.\n", - "- Evaluating the overall agent trajectory based on the tool usage and intermediate steps.\n", - "- Evaluating performance based on 'context' such as retrieved documents or tool results.\n", - "- Evaluating 'aspects' of the agent's response in a reference-free manner using targeted agent prompts.\n", - " \n", - "Below, we show how to run an evaluation chain that compares the model output with the ground-truth answers.\n", - "\n", - "**Note: the feedback API is currently experimental and subject to change.**" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "35db4025-9183-4e5f-ba14-0b1b380f49c7", - "metadata": { - "tags": [] - }, - 
"outputs": [], - "source": [ - "from langchain.evaluation.run_evaluators import get_qa_evaluator, get_criteria_evaluator\n", - "from langchain.chat_models import ChatOpenAI\n", - "\n", - "eval_llm = ChatOpenAI(temperature=0)\n", - "\n", - "qa_evaluator = get_qa_evaluator(eval_llm)\n", - "helpfulness_evaluator = get_criteria_evaluator(eval_llm, \"helpfulness\")\n", - "conciseness_evaluator = get_criteria_evaluator(eval_llm, \"conciseness\")\n", - "custom_criteria_evaluator = get_criteria_evaluator(\n", - " eval_llm,\n", - " {\n", - " \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n", - " },\n", - ")\n", - "\n", - "evaluators = [\n", - " qa_evaluator,\n", - " helpfulness_evaluator,\n", - " conciseness_evaluator,\n", - " custom_criteria_evaluator,\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "4c94a738-dcd3-442e-b8e7-dd36459f56e3", - "metadata": { - "tags": [] - }, - "outputs": [ + }, { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5fce1ce42a8c4110b7d12443948ac697", - "version_major": 2, - "version_minor": 0 + "cell_type": "code", + "execution_count": 11, + "id": "6e10f823", + "metadata": { + "tags": [] }, - "text/plain": [ - "0it [00:00, ?it/s]" + "outputs": [], + "source": [ + "# Since chains can be stateful (e.g. they can have memory), we need provide\n", + "# a way to initialize a new chain for each row in the dataset. This is done\n", + "# by passing in a factory function that returns a new chain for each row.\n", + "chain_factory = lambda: initialize_agent(\n", + " tools,\n", + " llm,\n", + " agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n", + " verbose=False,\n", + ")\n", + "\n", + "# If your chain is NOT stateful, your lambda can return the object directly\n", + "# to improve runtime performance. 
For example:\n", + "# chain_factory = lambda: agent" ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from tqdm.notebook import tqdm\n", - "\n", - "feedbacks = []\n", - "runs = client.list_runs(\n", - " session_name=chain_results[\"session_name\"], execution_order=1, error=False\n", - ")\n", - "for run in tqdm(runs):\n", - " if run.outputs is None:\n", - " continue\n", - " eval_feedback = []\n", - " for evaluator in evaluators:\n", - " eval_feedback.append(client.aevaluate_run(run, evaluator))\n", - " feedbacks.extend(await asyncio.gather(*eval_feedback))" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "8696f167-dc75-4ef8-8bb3-ac1ce8324f30", - "metadata": { - "tags": [] - }, - "outputs": [ + }, { - "data": { - "text/html": [ - "LangChain+ Client" + "cell_type": "code", + "execution_count": 13, + "id": "a8088b7d-3ab6-4279-94c8-5116fe7cee33", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processed examples: 1\r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Chain failed for example b36a82d3-4fb6-4bc4-87df-b7c355742b8e. 
Error: unknown format from LLM: Sorry, I cannot answer this question as it requires information that is not currently available.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processed examples: 6\r" + ] + } ], - "text/plain": [ - "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" + "source": [ + "chain_results = await arun_on_dataset(\n", + " dataset_name=dataset_name,\n", + " llm_or_chain_factory=chain_factory,\n", + " concurrency_level=5, # Optional, sets the number of examples to run at a time\n", + " verbose=True,\n", + " client=client,\n", + " tags=[\n", + " \"testing-notebook\",\n", + " \"turbo\",\n", + " ], # Optional, adds a tag to the resulting chain runs\n", + ")\n", + "\n", + "# Sometimes, the agent will error due to parsing issues, incompatible tool inputs, etc.\n", + "# These are logged as warnings here and captured as errors in the tracing UI." ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cdacd159-eb4d-49e9-bb2a-c55322c40ed4", + "metadata": { + "tags": [] + }, + "source": [ + "### Reviewing the Chain Results\n", + "\n", + "You can review the results of the run in the tracing UI below by navigating to the project \n", + "with the title **\"Search + Calculator Agent Evaluation\"**" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "136db492-d6ca-4215-96f9-439c23538241", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "LangChain+ Client" + ], + "text/plain": [ + "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can navigate to the UI by clicking on the link below\n", + "client" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "63ed6561-6574-43b3-a653-fe410aa8a617", + "metadata": {}, + "source": [ + "##
Running an Evaluation Chain\n", + "\n", + "Manually comparing the results of chains in the UI is effective, but it can be time consuming.\n", + "It's easier to leverage AI-assisted feedback to evaluate your agent's performance.\n", + "\n", + "A few ways of doing this include:\n", + "- Adding ground-truth answers as outputs to the dataset and evaluating relative to those references.\n", + "- Evaluating the overall agent trajectory based on the tool usage and intermediate steps.\n", + "- Evaluating performance based on 'context' such as retrieved documents or tool results.\n", + "- Evaluating 'aspects' of the agent's response in a reference-free manner using targeted agent prompts.\n", + " \n", + "Below, we show how to run an evaluation chain that compares the model output with the ground-truth answers.\n", + "\n", + "**Note: the feedback API is currently experimental and subject to change.**" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "35db4025-9183-4e5f-ba14-0b1b380f49c7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.evaluation.run_evaluators import get_qa_evaluator, get_criteria_evaluator\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "eval_llm = ChatOpenAI(temperature=0)\n", + "\n", + "qa_evaluator = get_qa_evaluator(eval_llm)\n", + "helpfulness_evaluator = get_criteria_evaluator(eval_llm, \"helpfulness\")\n", + "conciseness_evaluator = get_criteria_evaluator(eval_llm, \"conciseness\")\n", + "custom_criteria_evaluator = get_criteria_evaluator(\n", + " eval_llm,\n", + " {\n", + " \"fifth-grader-score\": \"Do you have to be smarter than a fifth grader to answer this question?\"\n", + " },\n", + ")\n", + "\n", + "evaluators = [\n", + " qa_evaluator,\n", + " helpfulness_evaluator,\n", + " conciseness_evaluator,\n", + " custom_criteria_evaluator,\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "4c94a738-dcd3-442e-b8e7-dd36459f56e3", + "metadata": { + 
"tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5fce1ce42a8c4110b7d12443948ac697", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from tqdm.notebook import tqdm\n", + "\n", + "feedbacks = []\n", + "runs = client.list_runs(project_name=chain_results[\"project_name\"], execution_order=1, error=False)\n", + "for run in tqdm(runs):\n", + " if run.outputs is None:\n", + " continue\n", + " eval_feedback = []\n", + " for evaluator in evaluators:\n", + " eval_feedback.append(client.aevaluate_run(run, evaluator))\n", + " feedbacks.extend(await asyncio.gather(*eval_feedback))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "8696f167-dc75-4ef8-8bb3-ac1ce8324f30", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "LangChain+ Client" + ], + "text/plain": [ + "LangChainPlusClient (API URL: https://dev.api.langchain.plus)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5037e54-2c5a-4993-9b46-2a98773d3079", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" } - ], - "source": [ - "client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5037e54-2c5a-4993-9b46-2a98773d3079", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 
(ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/poetry.lock b/poetry.lock index f93dccf2de..448c6ebcbf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4362,13 +4362,13 @@ tests = ["doctest", "pytest", "pytest-mock"] [[package]] name = "langchainplus-sdk" -version = "0.0.15" -description = "Client library to connect to the LangChainPlus LLM Tracing and Evaluation Platform." +version = "0.0.17" +description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langchainplus_sdk-0.0.15-py3-none-any.whl", hash = "sha256:e69bdbc8af6007ef2f774248d2483bbaf2d75712b1acc9ea50eda3b9f6dc567d"}, - {file = "langchainplus_sdk-0.0.15.tar.gz", hash = "sha256:ce40e9e3b6d42741f0a2aa89f83a12f2648f38690a9dd57e5fe3a56f2f232908"}, + {file = "langchainplus_sdk-0.0.17-py3-none-any.whl", hash = "sha256:899675fe850bb0829691ce7643d5c3b4425de1535b6f2d6ce1e5f5457ffb05bf"}, + {file = "langchainplus_sdk-0.0.17.tar.gz", hash = "sha256:6520c864a23dcadbe6fb7233a117347f6acc32725a97758e59354704c50de303"}, ] [package.dependencies] @@ -11771,4 +11771,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "09d46ad12369c6a16513558618553623cd520c2855bff3b8fe8248e1b18cbb94" +content-hash = "6e495e4f58127a5d2001385404b973896e275f5ca71a6ebe856cb114977189d1" diff --git a/pyproject.toml b/pyproject.toml index cce089dfe2..d5ba5d5622 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,7 +106,7 @@ pyspark = {version = "^3.4.0", optional = true} clarifai = {version = "9.1.0", optional = true} 
tigrisdb = {version = "^1.0.0b6", optional = true} nebula3-python = {version = "^3.4.0", optional = true} -langchainplus-sdk = ">=0.0.13" +langchainplus-sdk = ">=0.0.17" awadb = {version = "^0.3.3", optional = true} azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true} openllm = {version = ">=0.1.6", optional = true} diff --git a/tests/unit_tests/client/test_runner_utils.py b/tests/unit_tests/client/test_runner_utils.py index 2c0ccd4403..162418f321 100644 --- a/tests/unit_tests/client/test_runner_utils.py +++ b/tests/unit_tests/client/test_runner_utils.py @@ -176,7 +176,7 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None: {"result": f"Result for example {example.id}"} for _ in range(n_repetitions) ] - def mock_create_session(*args: Any, **kwargs: Any) -> None: + def mock_create_project(*args: Any, **kwargs: Any) -> None: pass with mock.patch.object( @@ -186,7 +186,7 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None: ), mock.patch( "langchain.client.runner_utils._arun_llm_or_chain", new=mock_arun_chain ), mock.patch.object( - LangChainPlusClient, "create_session", new=mock_create_session + LangChainPlusClient, "create_project", new=mock_create_project ): client = LangChainPlusClient(api_url="http://localhost:1984", api_key="123") chain = mock.MagicMock() @@ -195,7 +195,7 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None: dataset_name="test", llm_or_chain_factory=lambda: chain, concurrency_level=2, - session_name="test_session", + project_name="test_project", num_repetitions=num_repetitions, client=client, )