You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/docs/use_cases/autonomous_agents/marathon_times.ipynb

694 lines
24 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "14f8b67b",
"metadata": {},
"source": [
"## AutoGPT example finding Winning Marathon Times\n",
"\n",
"* Implementation of https://github.com/Significant-Gravitas/Auto-GPT \n",
"* With LangChain primitives (LLMs, PromptTemplates, VectorStores, Embeddings, Tools)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef972313-c05a-4c49-8fd1-03e599e21033",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# !pip install bs4\n",
"# !pip install nest_asyncio"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1cff42fd",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# General \n",
"import pandas as pd\n",
"from langchain.experimental.autonomous_agents.autogpt.agent import AutoGPT\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"from langchain.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent\n",
"from langchain.docstore.document import Document\n",
"from langchain.chains import RetrievalQA\n",
"import asyncio\n",
"import nest_asyncio\n",
"\n",
"\n",
"# Needed synce jupyter runs an async eventloop\n",
"nest_asyncio.apply()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "01283ac7-1da0-41ba-8011-bd455d21dd82",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=1.0)"
]
},
{
"cell_type": "markdown",
"id": "192496a7",
"metadata": {},
"source": [
"### Set up tools\n",
"\n",
"* We'll set up an AutoGPT with a `search` tool, and `write-file` tool, and a `read-file` tool, and a web browsing tool"
]
},
{
"cell_type": "markdown",
"id": "708a426f",
"metadata": {},
"source": [
"Define any other `tools` you want to use here"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cef4c150-0ef1-4a33-836b-01062fec134e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Tools\n",
"from typing import Optional\n",
"from langchain.agents import tool\n",
"from langchain.tools.file_management.read import ReadFileTool\n",
"from langchain.tools.file_management.write import WriteFileTool\n",
"\n",
"@tool\n",
"def process_csv(csv_file_path: str, instructions: str, output_path: Optional[str] = None) -> str:\n",
" \"\"\"Process a CSV by with pandas in a limited REPL. Only use this after writing data to disk as a csv file. Any figures must be saved to disk to be viewed by the human. Instructions should be written in natural language, not code. Assume the dataframe is already loaded.\"\"\"\n",
" try:\n",
" df = pd.read_csv(csv_file_path)\n",
" except Exception as e:\n",
" return f\"Error: {e}\"\n",
" agent = create_pandas_dataframe_agent(llm, df, max_iterations=30, verbose=True)\n",
" if output_path is not None:\n",
" instructions += f\" Save output to disk at {output_path}\"\n",
" try:\n",
" return agent.run(instructions)\n",
" except Exception as e:\n",
" return f\"Error: {e}\"\n"
]
},
{
"cell_type": "markdown",
"id": "51c07298-00e0-42d6-8aff-bd2e6bbd35a3",
"metadata": {},
"source": [
"**Web Search Tool**\n",
"\n",
"No need for API Tokens to use this tool, but it will require an optional dependency"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4afdedb2-f295-4ab8-9397-3640f5eeeed3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# !pip install duckduckgo_search"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "45f143de-e49e-4e27-88eb-ee44a4fdf933",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import json\n",
"from duckduckgo_search import ddg"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e2e799f4-86fb-4190-a298-4ae5c7b7a540",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"@tool\n",
"def web_search(query: str, num_results: int = 8) -> str:\n",
" \"\"\"Useful for general internet search queries.\"\"\"\n",
" search_results = []\n",
" if not query:\n",
" return json.dumps(search_results)\n",
"\n",
" results = ddg(query, max_results=num_results)\n",
" if not results:\n",
" return json.dumps(search_results)\n",
"\n",
" for j in results:\n",
" search_results.append(j)\n",
"\n",
" return json.dumps(search_results, ensure_ascii=False, indent=4)"
]
},
{
"cell_type": "markdown",
"id": "69975008-654a-4cbb-bdf6-63c8bae07eaa",
"metadata": {
"tags": []
},
"source": [
"**Browse a web page with PlayWright**"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "6bb5e47b-0f54-4faa-ae42-49a28fa5497b",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# !pip install playwright\n",
"# !playwright install"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "26b497d7-8e52-4c7f-8e7e-da0a48820a3c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"async def async_load_playwright(url: str) -> str:\n",
" \"\"\"Load the specified URLs using Playwright and parse using BeautifulSoup.\"\"\"\n",
" from bs4 import BeautifulSoup\n",
" from playwright.async_api import async_playwright\n",
"\n",
" results = \"\"\n",
" async with async_playwright() as p:\n",
" browser = await p.chromium.launch(headless=True)\n",
" try:\n",
" page = await browser.new_page()\n",
" await page.goto(url)\n",
"\n",
" page_source = await page.content()\n",
" soup = BeautifulSoup(page_source, \"html.parser\")\n",
"\n",
" for script in soup([\"script\", \"style\"]):\n",
" script.extract()\n",
"\n",
" text = soup.get_text()\n",
" lines = (line.strip() for line in text.splitlines())\n",
" chunks = (phrase.strip() for line in lines for phrase in line.split(\" \"))\n",
" results = \"\\n\".join(chunk for chunk in chunks if chunk)\n",
" except Exception as e:\n",
" results = f\"Error: {e}\"\n",
" await browser.close()\n",
" return results\n",
"\n",
"def run_async(coro):\n",
" event_loop = asyncio.get_event_loop()\n",
" return event_loop.run_until_complete(coro)\n",
"\n",
"@tool\n",
"def browse_web_page(url: str) -> str:\n",
" \"\"\"Verbose way to scrape a whole webpage. Likely to cause issues parsing.\"\"\"\n",
" return run_async(async_load_playwright(url))"
]
},
{
"cell_type": "markdown",
"id": "5ea71762-67ca-4e75-8c4d-00563064be71",
"metadata": {},
"source": [
"**Q&A Over a webpage**\n",
"\n",
"Help the model ask more directed questions of web pages to avoid cluttering its memory"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "1842929d-f18d-4edc-9fdd-82c929181141",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.tools.base import BaseTool\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"\n",
"from langchain.document_loaders import WebBaseLoader\n",
"from pydantic import Field\n",
"from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain, BaseCombineDocumentsChain\n",
"\n",
"def _get_text_splitter():\n",
" return RecursiveCharacterTextSplitter(\n",
" # Set a really small chunk size, just to show.\n",
" chunk_size = 500,\n",
" chunk_overlap = 20,\n",
" length_function = len,\n",
" )\n",
"\n",
"\n",
"class WebpageQATool(BaseTool):\n",
" name = \"query_webpage\"\n",
" description = \"Browse a webpage and retrieve the information relevant to the question.\"\n",
" text_splitter: RecursiveCharacterTextSplitter = Field(default_factory=_get_text_splitter)\n",
" qa_chain: BaseCombineDocumentsChain\n",
" \n",
" def _run(self, url: str, question: str) -> str:\n",
" \"\"\"Useful for browsing websites and scraping the text information.\"\"\"\n",
" result = browse_web_page.run(url)\n",
" docs = [Document(page_content=result, metadata={\"source\": url})]\n",
" web_docs = self.text_splitter.split_documents(docs)\n",
" results = []\n",
" # TODO: Handle this with a MapReduceChain\n",
" for i in range(0, len(web_docs), 4):\n",
" input_docs = web_docs[i:i+4]\n",
" window_result = self.qa_chain({\"input_documents\": input_docs, \"question\": question}, return_only_outputs=True)\n",
" results.append(f\"Response from window {i} - {window_result}\")\n",
" results_docs = [Document(page_content=\"\\n\".join(results), metadata={\"source\": url})]\n",
" return self.qa_chain({\"input_documents\": results_docs, \"question\": question}, return_only_outputs=True)\n",
" \n",
" async def _arun(self, url: str, question: str) -> str:\n",
" raise NotImplementedError\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "e6f72bd0",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm))"
]
},
{
"cell_type": "markdown",
"id": "8e39ee28",
"metadata": {},
"source": [
"### Set up memory\n",
"\n",
"* The memory here is used for the agents intermediate steps"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "1df7b724",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Memory\n",
"import faiss\n",
"from langchain.vectorstores import FAISS\n",
"from langchain.docstore import InMemoryDocstore\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.tools.human.tool import HumanInputRun\n",
"\n",
"embeddings_model = OpenAIEmbeddings()\n",
"embedding_size = 1536\n",
"index = faiss.IndexFlatL2(embedding_size)\n",
"vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})"
]
},
{
"cell_type": "markdown",
"id": "e40fd657",
"metadata": {},
"source": [
"### Setup model and AutoGPT\n",
"\n",
"`Model set-up`"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "88c8b184-67d7-4c35-84ae-9b14bef8c4e3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"tools = [\n",
" web_search,\n",
" WriteFileTool(),\n",
" ReadFileTool(),\n",
" process_csv,\n",
" query_website_tool,\n",
" # HumanInputRun(), # Activate if you want the permit asking for help from the human\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "709c08c2",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"agent = AutoGPT.from_llm_and_tools(\n",
" ai_name=\"Tom\",\n",
" ai_role=\"Assistant\",\n",
" tools=tools,\n",
" llm=llm,\n",
" memory=vectorstore.as_retriever(search_kwargs={\"k\": 8}),\n",
" # human_in_the_loop=True, # Set to True if you want to add feedback at each step.\n",
")\n",
"# agent.chain.verbose = True"
]
},
{
"cell_type": "markdown",
"id": "fc9b51ba",
"metadata": {},
"source": [
"### AutoGPT as a research / data munger \n",
"\n",
"#### `inflation` and `college tuition`\n",
" \n",
"Let's use AutoGPT as researcher and data munger / cleaner.\n",
" \n",
"I spent a lot of time over the years crawling data sources and cleaning data. \n",
"\n",
"Let's see if AutoGPT can do all of this for us!\n",
"\n",
"Here is the prompt comparing `inflation` and `college tuition`."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "64455d70-a134-4d11-826a-33e34c2ce287",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Failed to persist run: Object of type 'FAISS' is not JSON serializable\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"thoughts\": {\n",
" \"text\": \"I need to find the winning Boston Marathon times for the past 5 years.\",\n",
" \"reasoning\": \"I'll start by conducting a web search for the requested information.\",\n",
" \"plan\": \"- Conduct a web search\\n- Query relevant webpage\\n- Generate table\\n- Save data to file\",\n",
" \"criticism\": \"None\",\n",
" \"speak\": \"I will begin by conducting a web search to find the past 5 years' Boston Marathon winning times.\"\n",
" },\n",
" \"command\": {\n",
" \"name\": \"web_search\",\n",
" \"args\": {\n",
" \"query\": \"winning Boston Marathon times for the past 5 years\"\n",
" }\n",
" }\n",
"}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Failed to persist run: Object of type 'FAISS' is not JSON serializable\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"thoughts\": {\n",
" \"text\": \"I found several relevant search results, and I will use the query_webpage command on a specific URL to gather the information.\",\n",
" \"reasoning\": \"The Boston Athletic Association's official website (www.baa.org) is likely the most accurate source.\",\n",
" \"plan\": \"- Query the Boston Athletic Association webpage\\n- Filter and parse the data\\n- Generate table and save to file\",\n",
" \"criticism\": \"None\",\n",
" \"speak\": \"I will now query the Boston Athletic Association webpage to retrieve the information on the past 5 years' winning times.\"\n",
" },\n",
" \"command\": {\n",
" \"name\": \"query_webpage\",\n",
" \"args\": {\n",
" \"url\": \"https://www.baa.org/races/boston-marathon/results/champions\",\n",
" \"question\": \"winning times of the Boston Marathon for the past 5 years\"\n",
" }\n",
" }\n",
"}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Failed to persist run: Object of type 'FAISS' is not JSON serializable\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"thoughts\": {\n",
" \"text\": \"I have the winning times of the Boston Marathon for the past 5 years. I need to create a table with the names, countries of origin, and times.\",\n",
" \"reasoning\": \"I can use the information I've retrieved to generate a CSV file, then process the CSV file to create the table.\",\n",
" \"plan\": \"- Save data to a CSV file\\n- Process CSV file to generate table\",\n",
" \"criticism\": \"None\",\n",
" \"speak\": \"I will save the Boston Marathon data to a CSV file and then process it to generate a table.\"\n",
" },\n",
" \"command\": {\n",
" \"name\": \"write_file\",\n",
" \"args\": {\n",
" \"file_path\": \"boston_marathon_data.csv\",\n",
" \"text\": \"Year,Name,Country,Time\\n2022,Evans Chebet,Kenya,2:06:51\\n2021,Benson Kipruto,Kenya,2:09:51\\n2019,Lawrence Cherono,Kenya,2:07:57\\n2018,Yuki Kawauchi,Japan,2:15:58\\n2017,Geoffrey Kirui,Kenya,2:09:37\"\n",
" }\n",
" }\n",
"}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Failed to persist run: Object of type 'FAISS' is not JSON serializable\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"thoughts\": {\n",
" \"text\": \"I have the winning times for the Boston Marathon for the past 5 years. Now, I need to turn it into a table, save it as a CSV, and process it.\",\n",
" \"reasoning\": \"Saving data as a CSV will allow for a clean table format and output.\",\n",
" \"plan\": \"- Format the data as a CSV\\n- Write the CSV to disk\\n- Process the CSV and generate a table\",\n",
" \"criticism\": \"None\",\n",
" \"speak\": \"I will now format the winning times data as a CSV, save it, and process it to generate a table.\"\n",
" },\n",
" \"command\": {\n",
" \"name\": \"write_file\",\n",
" \"args\": {\n",
" \"file_path\": \"boston_marathon_winners.csv\",\n",
" \"text\": \"Year,Name,Country,Time\\n2022,Evans Chebet,Kenya,2:06:51\\n2021,Benson Kipruto,Kenya,2:09:51\\n2019,Lawrence Cherono,Kenya,2:07:57\\n2018,Yuki Kawauchi,Japan,2:15:58\\n2017,Geoffrey Kirui,Kenya,2:09:37\"\n",
" }\n",
" }\n",
"}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Failed to persist run: Object of type 'FAISS' is not JSON serializable\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"thoughts\": {\n",
" \"text\": \"I have the winning times for the past 5 years, now I need to organize the information into a table and save it as a file.\",\n",
" \"reasoning\": \"Storing the information in a table file format will make it easier to access and present the data.\",\n",
" \"plan\": \"- Save data in a CSV file\\n- Generate a table\",\n",
" \"criticism\": \"None\",\n",
" \"speak\": \"I will save the winning marathon times in a table format as a CSV file and generate a table.\"\n",
" },\n",
" \"command\": {\n",
" \"name\": \"write_file\",\n",
" \"args\": {\n",
" \"file_path\": \"boston_marathon_winners.csv\",\n",
" \"text\": \"Year,Name,Country,Time\\n2022,Evans Chebet,Kenya,2:06:51\\n2021,Benson Kipruto,Kenya,2:09:51\\n2019,Lawrence Cherono,Kenya,2:07:57\\n2018,Yuki Kawauchi,Japan,2:15:58\\n2017,Geoffrey Kirui,Kenya,2:09:37\"\n",
" }\n",
" }\n",
"}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Failed to persist run: Object of type 'FAISS' is not JSON serializable\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"thoughts\": {\n",
" \"text\": \"I previously found the winning times of the Boston Marathon for the past 5 years and saved it as a CSV file. Now, I need to process the CSV to create a table with the names, countries of origin, and times.\",\n",
" \"reasoning\": \"Processing the CSV will help generate a clean table to provide the answer.\",\n",
" \"plan\": \"- Process the CSV file\\n- Generate a table with the required columns\",\n",
" \"criticism\": \"None\",\n",
" \"speak\": \"I will process the Boston Marathon winners' CSV file to create a table with the names, countries of origin, and times.\"\n",
" },\n",
" \"command\": {\n",
" \"name\": \"process_csv\",\n",
" \"args\": {\n",
" \"csv_file_path\": \"boston_marathon_winners.csv\",\n",
" \"instructions\": \"Generate a table with columns Year, Name, Country, and Time.\"\n",
" }\n",
" }\n",
"}\n",
"\n",
"\n",
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
"\u001b[32;1m\u001b[1;3mThought: I need to convert the provided dataframe to a string in Markdown table format.\n",
"Action: python_repl_ast\n",
"Action Input: df.to_markdown(index=False)\u001b[0m\n",
"Observation: \u001b[36;1m\u001b[1;3m| Year | Name | Country | Time |\n",
"|-------:|:-----------------|:----------|:--------|\n",
"| 2022 | Evans Chebet | Kenya | 2:06:51 |\n",
"| 2021 | Benson Kipruto | Kenya | 2:09:51 |\n",
"| 2019 | Lawrence Cherono | Kenya | 2:07:57 |\n",
"| 2018 | Yuki Kawauchi | Japan | 2:15:58 |\n",
"| 2017 | Geoffrey Kirui | Kenya | 2:09:37 |\u001b[0m\n",
"Thought:\u001b[32;1m\u001b[1;3mI now know the final answer.\n",
"Final Answer: \n",
"\n",
"| Year | Name | Country | Time |\n",
"|-------:|:-----------------|:----------|:--------|\n",
"| 2022 | Evans Chebet | Kenya | 2:06:51 |\n",
"| 2021 | Benson Kipruto | Kenya | 2:09:51 |\n",
"| 2019 | Lawrence Cherono | Kenya | 2:07:57 |\n",
"| 2018 | Yuki Kawauchi | Japan | 2:15:58 |\n",
"| 2017 | Geoffrey Kirui | Kenya | 2:09:37 |\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Failed to persist run: Object of type 'FAISS' is not JSON serializable\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" \"thoughts\": {\n",
" \"text\": \"I have already retrieved the winning times for the past 5 years of the Boston Marathon and created a table. My objective is complete.\",\n",
" \"reasoning\": \"I have completed my task on this topic, so I don't need to use any other commands.\",\n",
" \"plan\": \"- Inform the user that the task is complete\",\n",
" \"criticism\": \"None\",\n",
" \"speak\": \"I have found the winning times for the past 5 years of the Boston Marathon and created a table. My task is complete.\"\n",
" },\n",
" \"command\": {\n",
" \"name\": \"finish\",\n",
" \"args\": {\n",
" \"response\": \"I have found the winning times for the past 5 years of the Boston Marathon and created a table. My task is complete.\"\n",
" }\n",
" }\n",
"}\n"
]
},
{
"data": {
"text/plain": [
"'I have found the winning times for the past 5 years of the Boston Marathon and created a table. My task is complete.'"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.run([\"What were the winning boston marathon times for the past 5 years? Generate a table of the names, countries of origin, and times.\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a6b4f96e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}