add optimization notebook (#18155)

3 months ago · c673717c2b
parent 5ab69f907f
commit c673717c2b
3 changed files with 1650 additions and 1 deletions
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@ -32,6 +32,6 @@ jobs:
      - name: Codespell
        uses: codespell-project/actions-codespell@v2
        with:
-          skip: guide_imports.json,*.ambr
+          skip: guide_imports.json,*.ambr,./cookbook/data/imdb_top_1000.csv
          ignore_words_list: ${{ steps.extract_ignore_words.outputs.ignore_words_list }}
          exclude_file: libs/community/langchain_community/llms/yuan2.py
--- a/cookbook/data/imdb_top_1000.csv
+++ b/cookbook/data/imdb_top_1000.csv
--- a/cookbook/optimization.ipynb
+++ b/cookbook/optimization.ipynb
@ -0,0 +1,648 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c7fe38bc",
+   "metadata": {},
+   "source": [
+    "# Optimization\n",
+    "\n",
+    "This notebook goes over how to optimize chains using LangChain and [LangSmith](https://smith.langchain.com)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2f87ccd5",
+   "metadata": {},
+   "source": [
+    "## Set up\n",
+    "\n",
+    "We will set an environment variable for LangSmith, and load the relevant data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "236bedc5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\"LANGCHAIN_PROJECT\"] = \"movie-qa\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a3fed0dd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "7cfff337",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"data/imdb_top_1000.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "2d20fb9c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[\"Released_Year\"] = df[\"Released_Year\"].astype(int, errors=\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "09fc8fe2",
+   "metadata": {},
+   "source": [
+    "## Create the initial retrieval chain\n",
+    "\n",
+    "We will use a self-query retriever"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "f71e24e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.schema import Document\n",
+    "from langchain_community.vectorstores import Chroma\n",
+    "from langchain_openai import OpenAIEmbeddings\n",
+    "\n",
+    "embeddings = OpenAIEmbeddings()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "8881ea8e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "records = df.to_dict(\"records\")\n",
+    "documents = [Document(page_content=d[\"Overview\"], metadata=d) for d in records]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "8f495423",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vectorstore = Chroma.from_documents(documents, embeddings)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "31d33d62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chains.query_constructor.base import AttributeInfo\n",
+    "from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
+    "from langchain_openai import ChatOpenAI\n",
+    "\n",
+    "metadata_field_info = [\n",
+    "    AttributeInfo(\n",
+    "        name=\"Released_Year\",\n",
+    "        description=\"The year the movie was released\",\n",
+    "        type=\"int\",\n",
+    "    ),\n",
+    "    AttributeInfo(\n",
+    "        name=\"Series_Title\",\n",
+    "        description=\"The title of the movie\",\n",
+    "        type=\"str\",\n",
+    "    ),\n",
+    "    AttributeInfo(\n",
+    "        name=\"Genre\",\n",
+    "        description=\"The genre of the movie\",\n",
+    "        type=\"string\",\n",
+    "    ),\n",
+    "    AttributeInfo(\n",
+    "        name=\"IMDB_Rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n",
+    "    ),\n",
+    "]\n",
+    "document_content_description = \"Brief summary of a movie\"\n",
+    "llm = ChatOpenAI(temperature=0)\n",
+    "retriever = SelfQueryRetriever.from_llm(\n",
+    "    llm, vectorstore, document_content_description, metadata_field_info, verbose=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "a731533b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.runnables import RunnablePassthrough"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "05181849",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.output_parsers import StrOutputParser\n",
+    "from langchain_core.prompts import ChatPromptTemplate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "feed4be6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = ChatPromptTemplate.from_template(\n",
+    "    \"\"\"Answer the user's question based on the below information:\n",
+    "\n",
+    "Information:\n",
+    "\n",
+    "{info}\n",
+    "\n",
+    "Question: {question}\"\"\"\n",
+    ")\n",
+    "generator = (prompt | ChatOpenAI() | StrOutputParser()).with_config(\n",
+    "    run_name=\"generator\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "eb16cc9a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chain = (\n",
+    "    RunnablePassthrough.assign(info=(lambda x: x[\"question\"]) | retriever) | generator\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c70911cc",
+   "metadata": {},
+   "source": [
+    "## Run examples\n",
+    "\n",
+    "Run examples through the chain. This can either be manually, or using a list of examples, or production traffic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "19a88d13",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'One of the horror movies released in the early 2000s is \"The Ring\" (2002), directed by Gore Verbinski.'"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chain.invoke({\"question\": \"what is a horror movie released in early 2000s\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "17f9cdae",
+   "metadata": {},
+   "source": [
+    "## Annotate\n",
+    "\n",
+    "Now, go to LangSmitha and annotate those examples as correct or incorrect"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5e211da6",
+   "metadata": {},
+   "source": [
+    "## Create Dataset\n",
+    "\n",
+    "We can now create a dataset from those runs.\n",
+    "\n",
+    "What we will do is find the runs marked as correct, then grab the sub-chains from them. Specifically, the query generator sub chain and the final generation step"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "e4024267",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langsmith import Client\n",
+    "\n",
+    "client = Client()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "3814efc5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "14"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "runs = list(\n",
+    "    client.list_runs(\n",
+    "        project_name=\"movie-qa\",\n",
+    "        execution_order=1,\n",
+    "        filter=\"and(eq(feedback_key, 'correctness'), eq(feedback_score, 1))\",\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "len(runs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "3eb123e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gen_runs = []\n",
+    "query_runs = []\n",
+    "for r in runs:\n",
+    "    gen_runs.extend(\n",
+    "        list(\n",
+    "            client.list_runs(\n",
+    "                project_name=\"movie-qa\",\n",
+    "                filter=\"eq(name, 'generator')\",\n",
+    "                trace_id=r.trace_id,\n",
+    "            )\n",
+    "        )\n",
+    "    )\n",
+    "    query_runs.extend(\n",
+    "        list(\n",
+    "            client.list_runs(\n",
+    "                project_name=\"movie-qa\",\n",
+    "                filter=\"eq(name, 'query_constructor')\",\n",
+    "                trace_id=r.trace_id,\n",
+    "            )\n",
+    "        )\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "a4397026",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'what is a high school comedy released in early 2000s'}"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "runs[0].inputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "3fa6ad2a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'output': 'One high school comedy released in the early 2000s is \"Mean Girls\" starring Lindsay Lohan, Rachel McAdams, and Tina Fey.'}"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "runs[0].outputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "1fda5b4b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'query': 'what is a high school comedy released in early 2000s'}"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "query_runs[0].inputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "1a1a51e6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'output': {'query': 'high school comedy',\n",
+       "  'filter': {'operator': 'and',\n",
+       "   'arguments': [{'comparator': 'eq', 'attribute': 'Genre', 'value': 'comedy'},\n",
+       "    {'operator': 'and',\n",
+       "     'arguments': [{'comparator': 'gte',\n",
+       "       'attribute': 'Released_Year',\n",
+       "       'value': 2000},\n",
+       "      {'comparator': 'lt', 'attribute': 'Released_Year', 'value': 2010}]}]}}}"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "query_runs[0].outputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "e9d9966b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'question': 'what is a high school comedy released in early 2000s',\n",
+       " 'info': []}"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gen_runs[0].inputs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "bc113f3d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'output': 'One high school comedy released in the early 2000s is \"Mean Girls\" starring Lindsay Lohan, Rachel McAdams, and Tina Fey.'}"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gen_runs[0].outputs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6cca74e5",
+   "metadata": {},
+   "source": [
+    "## Create datasets\n",
+    "\n",
+    "We can now create datasets for the query generation and final generation step.\n",
+    "We do this so that (1) we can inspect the datapoints, (2) we can edit them if needed, (3) we can add to them over time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "69966f0e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.create_dataset(\"movie-query_constructor\")\n",
+    "\n",
+    "inputs = [r.inputs for r in query_runs]\n",
+    "outputs = [r.outputs for r in query_runs]\n",
+    "\n",
+    "client.create_examples(\n",
+    "    inputs=inputs, outputs=outputs, dataset_name=\"movie-query_constructor\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "7e15770e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.create_dataset(\"movie-generator\")\n",
+    "\n",
+    "inputs = [r.inputs for r in gen_runs]\n",
+    "outputs = [r.outputs for r in gen_runs]\n",
+    "\n",
+    "client.create_examples(inputs=inputs, outputs=outputs, dataset_name=\"movie-generator\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "61cf9bcd",
+   "metadata": {},
+   "source": [
+    "## Use as few shot examples\n",
+    "\n",
+    "We can now pull down a dataset and use them as few shot examples in a future chain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "d9c79173",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "examples = list(client.list_examples(dataset_name=\"movie-query_constructor\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "a1771dd0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "\n",
+    "def filter_to_string(_filter):\n",
+    "    if \"operator\" in _filter:\n",
+    "        args = [filter_to_string(f) for f in _filter[\"arguments\"]]\n",
+    "        return f\"{_filter['operator']}({','.join(args)})\"\n",
+    "    else:\n",
+    "        comparator = _filter[\"comparator\"]\n",
+    "        attribute = json.dumps(_filter[\"attribute\"])\n",
+    "        value = json.dumps(_filter[\"value\"])\n",
+    "        return f\"{comparator}({attribute}, {value})\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "e67a3530",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_examples = []\n",
+    "\n",
+    "for e in examples:\n",
+    "    if \"filter\" in e.outputs[\"output\"]:\n",
+    "        string_filter = filter_to_string(e.outputs[\"output\"][\"filter\"])\n",
+    "    else:\n",
+    "        string_filter = \"NO_FILTER\"\n",
+    "    model_examples.append(\n",
+    "        (\n",
+    "            e.inputs[\"query\"],\n",
+    "            {\"query\": e.outputs[\"output\"][\"query\"], \"filter\": string_filter},\n",
+    "        )\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "84593135",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "retriever1 = SelfQueryRetriever.from_llm(\n",
+    "    llm,\n",
+    "    vectorstore,\n",
+    "    document_content_description,\n",
+    "    metadata_field_info,\n",
+    "    verbose=True,\n",
+    "    chain_kwargs={\"examples\": model_examples},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "4ec9bb92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chain1 = (\n",
+    "    RunnablePassthrough.assign(info=(lambda x: x[\"question\"]) | retriever1) | generator\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "64eb88e2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'1. \"Saving Private Ryan\" (1998) - Directed by Steven Spielberg, this war film follows a group of soldiers during World War II as they search for a missing paratrooper.\\n\\n2. \"The Matrix\" (1999) - Directed by the Wachowskis, this science fiction action film follows a computer hacker who discovers the truth about the reality he lives in.\\n\\n3. \"Lethal Weapon 4\" (1998) - Directed by Richard Donner, this action-comedy film follows two mismatched detectives as they investigate a Chinese immigrant smuggling ring.\\n\\n4. \"The Fifth Element\" (1997) - Directed by Luc Besson, this science fiction action film follows a cab driver who must protect a mysterious woman who holds the key to saving the world.\\n\\n5. \"The Rock\" (1996) - Directed by Michael Bay, this action thriller follows a group of rogue military men who take over Alcatraz and threaten to launch missiles at San Francisco.'"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chain1.invoke(\n",
+    "    {\"question\": \"what are good action movies made before 2000 but after 1997?\"}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1ee8b55",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}