mirror of
https://github.com/hwchase17/langchain
synced 2024-11-16 06:13:16 +00:00
649 lines
16 KiB
Plaintext
649 lines
16 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "c7fe38bc",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Optimization\n",
|
|
"\n",
|
|
"This notebook goes over how to optimize chains using LangChain and [LangSmith](https://smith.langchain.com)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "2f87ccd5",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Set up\n",
|
|
"\n",
|
|
"We will set an environment variable that selects the LangSmith project, and then load the relevant data."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "236bedc5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"\n",
|
|
"os.environ[\"LANGCHAIN_PROJECT\"] = \"movie-qa\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "a3fed0dd",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "7cfff337",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = pd.read_csv(\"data/imdb_top_1000.csv\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "2d20fb9c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df[\"Released_Year\"] = df[\"Released_Year\"].astype(int, errors=\"ignore\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "09fc8fe2",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Create the initial retrieval chain\n",
|
|
"\n",
|
|
"We will use a self-query retriever"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "f71e24e2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain.schema import Document\n",
|
|
"from langchain_community.vectorstores import Chroma\n",
|
|
"from langchain_openai import OpenAIEmbeddings\n",
|
|
"\n",
|
|
"embeddings = OpenAIEmbeddings()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "8881ea8e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"records = df.to_dict(\"records\")\n",
|
|
"documents = [Document(page_content=d[\"Overview\"], metadata=d) for d in records]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "8f495423",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"vectorstore = Chroma.from_documents(documents, embeddings)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "31d33d62",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain.chains.query_constructor.base import AttributeInfo\n",
|
|
"from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
|
|
"from langchain_openai import ChatOpenAI\n",
|
|
"\n",
|
|
"metadata_field_info = [\n",
|
|
" AttributeInfo(\n",
|
|
" name=\"Released_Year\",\n",
|
|
" description=\"The year the movie was released\",\n",
|
|
" type=\"int\",\n",
|
|
" ),\n",
|
|
" AttributeInfo(\n",
|
|
" name=\"Series_Title\",\n",
|
|
" description=\"The title of the movie\",\n",
|
|
" type=\"str\",\n",
|
|
" ),\n",
|
|
" AttributeInfo(\n",
|
|
" name=\"Genre\",\n",
|
|
" description=\"The genre of the movie\",\n",
|
|
" type=\"string\",\n",
|
|
" ),\n",
|
|
" AttributeInfo(\n",
|
|
" name=\"IMDB_Rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n",
|
|
" ),\n",
|
|
"]\n",
|
|
"document_content_description = \"Brief summary of a movie\"\n",
|
|
"llm = ChatOpenAI(temperature=0)\n",
|
|
"retriever = SelfQueryRetriever.from_llm(\n",
|
|
" llm, vectorstore, document_content_description, metadata_field_info, verbose=True\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "a731533b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain_core.runnables import RunnablePassthrough"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "05181849",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain_core.output_parsers import StrOutputParser\n",
|
|
"from langchain_core.prompts import ChatPromptTemplate"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "feed4be6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"prompt = ChatPromptTemplate.from_template(\n",
|
|
" \"\"\"Answer the user's question based on the below information:\n",
|
|
"\n",
|
|
"Information:\n",
|
|
"\n",
|
|
"{info}\n",
|
|
"\n",
|
|
"Question: {question}\"\"\"\n",
|
|
")\n",
|
|
"generator = (prompt | ChatOpenAI() | StrOutputParser()).with_config(\n",
|
|
" run_name=\"generator\"\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "eb16cc9a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"chain = (\n",
|
|
" RunnablePassthrough.assign(info=(lambda x: x[\"question\"]) | retriever) | generator\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "c70911cc",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Run examples\n",
|
|
"\n",
|
|
"Run examples through the chain. This can be done manually, with a list of examples, or with production traffic."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "19a88d13",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'One of the horror movies released in the early 2000s is \"The Ring\" (2002), directed by Gore Verbinski.'"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"chain.invoke({\"question\": \"what is a horror movie released in early 2000s\"})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "17f9cdae",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Annotate\n",
|
|
"\n",
|
|
"Now, go to LangSmith and annotate those examples as correct or incorrect. The code below selects runs whose `correctness` feedback score is 1."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5e211da6",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Create Dataset\n",
|
|
"\n",
|
|
"We can now create a dataset from those runs.\n",
|
|
"\n",
|
|
"We will find the runs marked as correct, then grab the sub-chain runs from each of them: specifically, the query constructor sub-chain and the final generation step."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "e4024267",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langsmith import Client\n",
|
|
"\n",
|
|
"client = Client()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "3814efc5",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"14"
|
|
]
|
|
},
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"runs = list(\n",
|
|
" client.list_runs(\n",
|
|
" project_name=\"movie-qa\",\n",
|
|
" execution_order=1,\n",
|
|
" filter=\"and(eq(feedback_key, 'correctness'), eq(feedback_score, 1))\",\n",
|
|
" )\n",
|
|
")\n",
|
|
"\n",
|
|
"len(runs)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "3eb123e0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"gen_runs = []\n",
|
|
"query_runs = []\n",
|
|
"for r in runs:\n",
|
|
" gen_runs.extend(\n",
|
|
" list(\n",
|
|
" client.list_runs(\n",
|
|
" project_name=\"movie-qa\",\n",
|
|
" filter=\"eq(name, 'generator')\",\n",
|
|
" trace_id=r.trace_id,\n",
|
|
" )\n",
|
|
" )\n",
|
|
" )\n",
|
|
" query_runs.extend(\n",
|
|
" list(\n",
|
|
" client.list_runs(\n",
|
|
" project_name=\"movie-qa\",\n",
|
|
" filter=\"eq(name, 'query_constructor')\",\n",
|
|
" trace_id=r.trace_id,\n",
|
|
" )\n",
|
|
" )\n",
|
|
" )"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "a4397026",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'question': 'what is a high school comedy released in early 2000s'}"
|
|
]
|
|
},
|
|
"execution_count": 21,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"runs[0].inputs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "3fa6ad2a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'output': 'One high school comedy released in the early 2000s is \"Mean Girls\" starring Lindsay Lohan, Rachel McAdams, and Tina Fey.'}"
|
|
]
|
|
},
|
|
"execution_count": 20,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"runs[0].outputs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "1fda5b4b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'query': 'what is a high school comedy released in early 2000s'}"
|
|
]
|
|
},
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"query_runs[0].inputs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "1a1a51e6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'output': {'query': 'high school comedy',\n",
|
|
" 'filter': {'operator': 'and',\n",
|
|
" 'arguments': [{'comparator': 'eq', 'attribute': 'Genre', 'value': 'comedy'},\n",
|
|
" {'operator': 'and',\n",
|
|
" 'arguments': [{'comparator': 'gte',\n",
|
|
" 'attribute': 'Released_Year',\n",
|
|
" 'value': 2000},\n",
|
|
" {'comparator': 'lt', 'attribute': 'Released_Year', 'value': 2010}]}]}}}"
|
|
]
|
|
},
|
|
"execution_count": 23,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"query_runs[0].outputs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"id": "e9d9966b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'question': 'what is a high school comedy released in early 2000s',\n",
|
|
" 'info': []}"
|
|
]
|
|
},
|
|
"execution_count": 24,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"gen_runs[0].inputs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"id": "bc113f3d",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'output': 'One high school comedy released in the early 2000s is \"Mean Girls\" starring Lindsay Lohan, Rachel McAdams, and Tina Fey.'}"
|
|
]
|
|
},
|
|
"execution_count": 25,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"gen_runs[0].outputs"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "6cca74e5",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Create datasets\n",
|
|
"\n",
|
|
"We can now create datasets for the query generation and final generation steps.\n",
|
|
"We do this so that (1) we can inspect the datapoints, (2) we can edit them if needed, and (3) we can add to them over time."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "69966f0e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"client.create_dataset(\"movie-query_constructor\")\n",
|
|
"\n",
|
|
"inputs = [r.inputs for r in query_runs]\n",
|
|
"outputs = [r.outputs for r in query_runs]\n",
|
|
"\n",
|
|
"client.create_examples(\n",
|
|
" inputs=inputs, outputs=outputs, dataset_name=\"movie-query_constructor\"\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "7e15770e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"client.create_dataset(\"movie-generator\")\n",
|
|
"\n",
|
|
"inputs = [r.inputs for r in gen_runs]\n",
|
|
"outputs = [r.outputs for r in gen_runs]\n",
|
|
"\n",
|
|
"client.create_examples(inputs=inputs, outputs=outputs, dataset_name=\"movie-generator\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "61cf9bcd",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Use as few shot examples\n",
|
|
"\n",
|
|
"We can now pull down a dataset and use its examples as few-shot examples in a future chain."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"id": "d9c79173",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"examples = list(client.list_examples(dataset_name=\"movie-query_constructor\"))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"id": "a1771dd0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import json\n",
|
|
"\n",
|
|
"\n",
|
|
"def filter_to_string(_filter):\n",
|
|
" if \"operator\" in _filter:\n",
|
|
" args = [filter_to_string(f) for f in _filter[\"arguments\"]]\n",
|
|
" return f\"{_filter['operator']}({','.join(args)})\"\n",
|
|
" else:\n",
|
|
" comparator = _filter[\"comparator\"]\n",
|
|
" attribute = json.dumps(_filter[\"attribute\"])\n",
|
|
" value = json.dumps(_filter[\"value\"])\n",
|
|
" return f\"{comparator}({attribute}, {value})\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"id": "e67a3530",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"model_examples = []\n",
|
|
"\n",
|
|
"for e in examples:\n",
|
|
" if \"filter\" in e.outputs[\"output\"]:\n",
|
|
" string_filter = filter_to_string(e.outputs[\"output\"][\"filter\"])\n",
|
|
" else:\n",
|
|
" string_filter = \"NO_FILTER\"\n",
|
|
" model_examples.append(\n",
|
|
" (\n",
|
|
" e.inputs[\"query\"],\n",
|
|
" {\"query\": e.outputs[\"output\"][\"query\"], \"filter\": string_filter},\n",
|
|
" )\n",
|
|
" )"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"id": "84593135",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"retriever1 = SelfQueryRetriever.from_llm(\n",
|
|
" llm,\n",
|
|
" vectorstore,\n",
|
|
" document_content_description,\n",
|
|
" metadata_field_info,\n",
|
|
" verbose=True,\n",
|
|
" chain_kwargs={\"examples\": model_examples},\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"id": "4ec9bb92",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"chain1 = (\n",
|
|
" RunnablePassthrough.assign(info=(lambda x: x[\"question\"]) | retriever1) | generator\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"id": "64eb88e2",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"'1. \"Saving Private Ryan\" (1998) - Directed by Steven Spielberg, this war film follows a group of soldiers during World War II as they search for a missing paratrooper.\\n\\n2. \"The Matrix\" (1999) - Directed by the Wachowskis, this science fiction action film follows a computer hacker who discovers the truth about the reality he lives in.\\n\\n3. \"Lethal Weapon 4\" (1998) - Directed by Richard Donner, this action-comedy film follows two mismatched detectives as they investigate a Chinese immigrant smuggling ring.\\n\\n4. \"The Fifth Element\" (1997) - Directed by Luc Besson, this science fiction action film follows a cab driver who must protect a mysterious woman who holds the key to saving the world.\\n\\n5. \"The Rock\" (1996) - Directed by Michael Bay, this action thriller follows a group of rogue military men who take over Alcatraz and threaten to launch missiles at San Francisco.'"
|
|
]
|
|
},
|
|
"execution_count": 31,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"chain1.invoke(\n",
|
|
" {\"question\": \"what are good action movies made before 2000 but after 1997?\"}\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e1ee8b55",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.1"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|