From 491c27f861b1a66b29b64595dc1b364748cf5f32 Mon Sep 17 00:00:00 2001 From: Zander Chase <130414180+vowelparrot@users.noreply.github.com> Date: Fri, 28 Apr 2023 10:42:44 -0700 Subject: [PATCH] PlayWright Web Browser Toolkit (#3262) Adds a PlayWright web browser toolkit with the following tools: - NavigateTool (navigate_browser) - navigate to a URL - NavigateBackTool (previous_webpage) - navigate back to the previous page in the browser history - ClickTool (click_element) - click on an element (specified by selector) - ExtractTextTool (extract_text) - use beautiful soup to extract text from the current web page - ExtractHyperlinksTool (extract_hyperlinks) - use beautiful soup to extract hyperlinks from the current web page - GetElementsTool (get_elements) - select elements by CSS selector - CurrentWebPageTool (current_webpage) - get the current page URL --- .../agents/toolkits/examples/playwright.ipynb | 179 ++++++++++++++++++ langchain/agents/agent_toolkits/__init__.py | 2 + .../agent_toolkits/playwright/__init__.py | 4 + .../agent_toolkits/playwright/toolkit.py | 66 +++++++ langchain/tools/__init__.py | 22 ++- langchain/tools/playwright/__init__.py | 21 ++ langchain/tools/playwright/base.py | 40 ++++ langchain/tools/playwright/click.py | 29 +++ langchain/tools/playwright/current_page.py | 21 ++ .../tools/playwright/extract_hyperlinks.py | 64 +++++++ langchain/tools/playwright/extract_text.py | 39 ++++ langchain/tools/playwright/get_elements.py | 62 ++++++ langchain/tools/playwright/navigate.py | 29 +++ langchain/tools/playwright/navigate_back.py | 31 +++ langchain/tools/playwright/utils.py | 35 ++++ 15 files changed, 642 insertions(+), 2 deletions(-) create mode 100644 docs/modules/agents/toolkits/examples/playwright.ipynb create mode 100644 langchain/agents/agent_toolkits/playwright/__init__.py create mode 100644 langchain/agents/agent_toolkits/playwright/toolkit.py create mode 100644 langchain/tools/playwright/__init__.py create mode 100644 langchain/tools/playwright/base.py create mode 100644 
langchain/tools/playwright/click.py create mode 100644 langchain/tools/playwright/current_page.py create mode 100644 langchain/tools/playwright/extract_hyperlinks.py create mode 100644 langchain/tools/playwright/extract_text.py create mode 100644 langchain/tools/playwright/get_elements.py create mode 100644 langchain/tools/playwright/navigate.py create mode 100644 langchain/tools/playwright/navigate_back.py create mode 100644 langchain/tools/playwright/utils.py diff --git a/docs/modules/agents/toolkits/examples/playwright.ipynb b/docs/modules/agents/toolkits/examples/playwright.ipynb new file mode 100644 index 00000000..0d628c07 --- /dev/null +++ b/docs/modules/agents/toolkits/examples/playwright.ipynb @@ -0,0 +1,179 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PlayWright Browser Toolkit\n", + "\n", + "This toolkit is used to interact with the browser. While other tools (like the Requests tools) are fine for static sites, Browser toolkits let your agent navigate the web and interact with dynamically rendered sites. 
Some tools bundled within the Browser toolkit include:\n", + "\n", + "- NavigateTool (navigate_browser) - navigate to a URL\n", + "- NavigateBackTool (previous_webpage) - navigate back to the previous page in the browser history\n", + "- ClickTool (click_element) - click on an element (specified by selector)\n", + "- ExtractTextTool (extract_text) - use beautiful soup to extract text from the current web page\n", + "- ExtractHyperlinksTool (extract_hyperlinks) - use beautiful soup to extract hyperlinks from the current web page\n", + "- GetElementsTool (get_elements) - select elements by CSS selector\n", + "- CurrentWebPageTool (current_webpage) - get the current page URL\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install playwright > /dev/null\n", + "# !pip install lxml\n", + "\n", + "# If this is your first time using playwright, you'll have to install a browser executable.\n", + "# Running `playwright install` by default installs a chromium browser executable.\n", + "# playwright install" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents.agent_toolkits import PlayWrightBrowserToolkit" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[ClickTool(name='click_element', description='Click on an element with the given CSS selector', args_schema=, return_direct=False, verbose=False, callback_manager=, browser= version=112.0.5615.29>),\n", + " NavigateTool(name='navigate_browser', description='Navigate a browser to the specified URL', args_schema=, return_direct=False, verbose=False, callback_manager=, browser= version=112.0.5615.29>),\n", + " NavigateBackTool(name='previous_webpage', description='Navigate back to the previous page in the browser history', args_schema=, return_direct=False, verbose=False, callback_manager=, browser= version=112.0.5615.29>),\n", + 
" ExtractTextTool(name='extract_text', description='Extract all the text on the current webpage', args_schema=, return_direct=False, verbose=False, callback_manager=, browser= version=112.0.5615.29>),\n", + " ExtractHyperlinksTool(name='extract_hyperlinks', description='Extract all hyperlinks on the current webpage', args_schema=, return_direct=False, verbose=False, callback_manager=, browser= version=112.0.5615.29>),\n", + " GetElementsTool(name='get_elements', description='Retrieve elements in the current web page matching the given CSS selector', args_schema=, return_direct=False, verbose=False, callback_manager=, browser= version=112.0.5615.29>),\n", + " CurrentWebPageTool(name='current_webpage', description='Returns the URL of the current page', args_schema=, return_direct=False, verbose=False, callback_manager=, browser= version=112.0.5615.29>)]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This import is required only for jupyter notebooks, since they have their own eventloop\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "toolkit = PlayWrightBrowserToolkit()\n", + "tools = toolkit.get_tools()\n", + "tools" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "tools_by_name = {tool.name: tool for tool in tools}\n", + "navigate_tool = tools_by_name[\"navigate_browser\"]\n", + "get_elements_tool = tools_by_name[\"get_elements\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Navigating to https://web.archive.org/web/20230428131116/https://www.cnn.com/world returned status code 200'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "navigate_tool.run({\"url\": \"https://web.archive.org/web/20230428131116/https://www.cnn.com/world\"})" + ] + }, + { + "cell_type": 
"code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[{\"innerText\": \"As US and Philippine defense ties grow, China warns over Taiwan tensions\"}, {\"innerText\": \"Almost two-thirds of elephant habitat lost across Asia, study finds\"}, {\"innerText\": \"\\\\u2018We don\\\\u2019t sleep \\\\u2026 I would call it fainting\\\\u2019: Working as a doctor in Sudan\\\\u2019s crisis\"}, {\"innerText\": \"Kenya arrests second pastor to face criminal charges \\\\u2018related to mass killing of his followers\\\\u2019\"}, {\"innerText\": \"Ocean census aims to discover 100,000 previously unknown marine species\"}, {\"innerText\": \"Iran\\\\u2019s Navy seizes Marshall Islands-flagged ship\"}, {\"innerText\": \"German landlord wins right to sunbathe naked despite complaints from tenants\"}, {\"innerText\": \"Single people should be \\\\u2018valued\\\\u2019 as Jesus was single, Church of England says\"}, {\"innerText\": \"Turkey\\\\u2019s Erdogan cancels public appearances after falling ill as election nears\"}, {\"innerText\": \"Drought-stricken Spain braces for exceptionally high temperatures expected to break April records\"}, {\"innerText\": \"With Zelensky call, Xi Jinping steps up bid to broker peace \\\\u2013 but does he have a plan?\"}, {\"innerText\": \"Indian and Chinese defense ministers to meet face to face\"}, {\"innerText\": \"Pope to allow women to vote at global bishops meeting\"}, {\"innerText\": \"Catastrophic drought that\\\\u2019s pushed millions into crisis made 100 times more likely by climate change, analysis finds\"}, {\"innerText\": \"\\\\u2018Bring Ya Ya home\\\\u2019: How a panda in the US turbocharged Chinese nationalist sentiment\"}, {\"innerText\": \"\\\\u2018Often they shoot at each other\\\\u2019: Ukrainian drone operator details chaos in Russian ranks\"}, {\"innerText\": \"U.S. 
talk show host Jerry Springer dies at 79\"}, {\"innerText\": \"Girl to get life-saving treatment for rare immune disease\"}, {\"innerText\": \"Wall Street Journal editor discusses reporter\\\\u2019s arrest in Moscow\"}, {\"innerText\": \"Belgium destroys shipment of American beer after taking issue with \\\\u2018Champagne of Beer\\\\u2019 slogan\"}, {\"innerText\": \"UK Prime Minister Rishi Sunak rocked by resignation of top ally Raab over bullying allegations\"}, {\"innerText\": \"Coronation mishaps King Charles III will want to avoid\"}, {\"innerText\": \"Russian jet accidentally drops bomb on Russian city of Belgorod, state media says\"}, {\"innerText\": \"Queen Camilla\\\\u2019s son, Tom Parker Bowles, says his mother \\\\u2018married the person she loved\\\\u2019\"}, {\"innerText\": \"These Iranian activists fled for freedom. The regime still managed to find them\"}, {\"innerText\": \"A divided Israel stands at a perilous crossroads on its 75th birthday\"}, {\"innerText\": \"Palestinian reporter breaks barriers by reporting in Hebrew on Israeli TV\"}, {\"innerText\": \"One-fifth of water pollution comes from textile dyes. 
But a shellfish-inspired solution could clean it up\"}, {\"innerText\": \"\\\\u2018People sacrificed their lives for just\\\\u00a010 dollars\\\\u2019: At least 78 killed in Yemen crowd surge\"}, {\"innerText\": \"Israeli police say two men shot near Jewish tomb in Jerusalem in suspected \\\\u2018terror attack\\\\u2019\"}, {\"innerText\": \"Houthis try to reassure skeptics they won\\\\u2019t seek full control of Yemen, as Saudis eye exit\"}, {\"innerText\": \"The week in 33 photos\"}, {\"innerText\": \"Hong Kong\\\\u2019s endangered turtles\"}, {\"innerText\": \"In pictures: Britain\\\\u2019s Queen Camilla\"}, {\"innerText\": \"In pictures: Charles and Camilla\"}, {\"innerText\": \"For years, a UK mining giant was untouchable in Zambia for pollution until a former miner\\\\u2019s son took them on\"}, {\"innerText\": \"Former Sudanese minister Ahmed Haroun wanted on war crimes charges freed from Khartoum prison\"}, {\"innerText\": \"WHO warns of \\\\u2018biological risk\\\\u2019 after Sudan fighters seize lab, as violence mars US-brokered ceasefire\"}, {\"innerText\": \"Rival generals are battling for control in Sudan. Here\\\\u2019s a simple guide to the fighting\"}, {\"innerText\": \"How Colombia\\\\u2019s Petro, a former leftwing guerrilla, found his opening in Washington\"}, {\"innerText\": \"Bolsonaro accidentally created Facebook post questioning Brazil election results, say his attorneys\"}, {\"innerText\": \"Crowd kills over a dozen suspected gang members in Haiti\"}, {\"innerText\": \"Thousands of tequila bottles containing liquid meth seized\"}, {\"innerText\": \"Why send a US stealth submarine to South Korea \\\\u2013 and tell the world about it?\"}, {\"innerText\": \"Fukushima\\\\u2019s fishing industry survived a nuclear disaster. 
12 years on, it fears Tokyo\\\\u2019s next move may finish it off\"}, {\"innerText\": \"Singapore executes man for trafficking two pounds of cannabis\"}, {\"innerText\": \"Conservative Thai party looks to woo voters with promise to legalize sex toys\"}, {\"innerText\": \"Watch planes take off in Japan \\\\u2014 from an onsen\"}, {\"innerText\": \"Bilt\\\\u2019s May Rent Day promotion: Fly to Europe for as few as 6,000 Bilt points\"}, {\"innerText\": \"Cabeau just dropped the Evolution Earth, a new eco-minded travel pillow\"}, {\"innerText\": \"Nemo\\\\u2019s Garden: The future of farming could be under the sea\"}, {\"innerText\": \"Cadence\\\\u2019s cult-favorite travel capsules are now available in more sizes\"}, {\"innerText\": \"Judy Blume\\\\u2019s books were formative for generations of readers. Here\\\\u2019s why they endure\"}, {\"innerText\": \"Craft, salvage and sustainability take center stage at Milan Design Week\"}, {\"innerText\": \"Life-sized chocolate King Charles III sculpture unveiled to celebrate coronation\"}, {\"innerText\": \"Rock legend Freddie Mercury\\\\u2019s personal possessions are going up for auction\"}, {\"innerText\": \"John Travolta\\\\u2019s white \\\\u2018Saturday Night Fever\\\\u2019 suit fetches $260K at auction\"}, {\"innerText\": \"The South is in the crosshairs of severe weather again, as the multi-day threat of large hail and tornadoes continues\"}, {\"innerText\": \"Spring snowmelt has cities along the Mississippi bracing for flooding in homes and businesses\"}, {\"innerText\": \"Know the difference between a tornado watch, a tornado warning and a tornado emergency\"}, {\"innerText\": \"Large hail drops on parts of Texas and Florida as South remains at risk of severe storms\"}, {\"innerText\": \"House Republicans adopt bill raising U.S. 
debt limit and cutting spending\"}, {\"innerText\": \"Judge puts hold on Missouri rule limiting gender-affirming care\"}, {\"innerText\": \"Eleven people killed in suspected Maoist militant attack in central India\"}, {\"innerText\": \"Prosecutors tell judge intel the Air National Guardsman took \\\\u2018far exceeds\\\\u2019 what has been reported\"}, {\"innerText\": \"The son of a Sudanese doctor killed in a mortar attack speaks with Rosemary Church\"}, {\"innerText\": \"Melting snow worsens flooding along the Mississippi River\"}, {\"innerText\": \"Writer E. Jean Carroll testifies in civil suit against Donald Trump\"}, {\"innerText\": \"Nepalese authorities issue record number of Everest permits\"}, {\"innerText\": \"Cruise passenger disappears overboard during trip from Australia to Hawaii\"}, {\"innerText\": \"Watch South Korean president sing \\\\u2018American Pie\\\\u2019 for Biden\"}, {\"innerText\": \"See Russian fighter jet on fire after blowing up mid-flight\"}, {\"innerText\": \"Disney Sues Florida Governor Ron DeSantis\"}, {\"innerText\": \"Yasmeen Lari, \\\\u2018starchitect\\\\u2019 turned social engineer, wins one of architecture\\\\u2019s most coveted prizes\"}, {\"innerText\": \"A massive, newly restored Frank Lloyd Wright mansion is up for sale\"}, {\"innerText\": \"Are these the most sustainable architectural projects in the world?\"}, {\"innerText\": \"Step inside a $72 million London townhouse in a converted army barracks\"}, {\"innerText\": \"A 3D-printing company is preparing to build on the lunar surface. 
But first, a moonshot at home\"}, {\"innerText\": \"Carolina Panthers select QB Bryce Young with first pick of NFL Draft\"}, {\"innerText\": \"Brittney Griner says she\\\\u2019ll \\\\u2018never go overseas again\\\\u2019 to play unless it\\\\u2019s for the Olympics after being detained in Russia\"}, {\"innerText\": \"Pel\\\\u00e9 added to Portuguese dictionary as an adjective for \\\\u2018out of the ordinary\\\\u2019\"}, {\"innerText\": \"Players reimbursing fans and the interim manager getting sacked: How Tottenham Hotspur fell into disrepair\"}, {\"innerText\": \"This CNN Hero is recruiting recreational divers to help rebuild reefs in Florida one coral at a time\"}, {\"innerText\": \"This CNN Hero offers judgment-free veterinary care for the pets of those experiencing homelessness\"}, {\"innerText\": \"Don\\\\u2019t give up on milestones: A CNN Hero\\\\u2019s message for Autism Awareness Month\"}, {\"innerText\": \"CNN Hero of the Year Nelly Cheboi returned to Kenya with plans to lift more students out of poverty\"}]'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The browser is shared across tools, so the agent can interact in a stateful manner\n", + "get_elements_tool.run({\"selector\": \".container__headline\", \"attributes\": [\"innerText\"]})" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'https://web.archive.org/web/20230428033754/https://www.cnn.com/world'" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# If the agent wants to remember the current webpage, it can use the `current_webpage` tool\n", + "tools_by_name['current_webpage'].run({})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": 
"python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/langchain/agents/agent_toolkits/__init__.py b/langchain/agents/agent_toolkits/__init__.py index 5aa6fc8e..54f1a2e6 100644 --- a/langchain/agents/agent_toolkits/__init__.py +++ b/langchain/agents/agent_toolkits/__init__.py @@ -8,6 +8,7 @@ from langchain.agents.agent_toolkits.nla.toolkit import NLAToolkit from langchain.agents.agent_toolkits.openapi.base import create_openapi_agent from langchain.agents.agent_toolkits.openapi.toolkit import OpenAPIToolkit from langchain.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent +from langchain.agents.agent_toolkits.playwright.toolkit import PlayWrightBrowserToolkit from langchain.agents.agent_toolkits.powerbi.base import create_pbi_agent from langchain.agents.agent_toolkits.powerbi.chat_base import create_pbi_chat_agent from langchain.agents.agent_toolkits.powerbi.toolkit import PowerBIToolkit @@ -46,4 +47,5 @@ __all__ = [ "create_csv_agent", "ZapierToolkit", "JiraToolkit", + "PlayWrightBrowserToolkit", ] diff --git a/langchain/agents/agent_toolkits/playwright/__init__.py b/langchain/agents/agent_toolkits/playwright/__init__.py new file mode 100644 index 00000000..e8c51061 --- /dev/null +++ b/langchain/agents/agent_toolkits/playwright/__init__.py @@ -0,0 +1,4 @@ +"""Playwright browser toolkit.""" +from langchain.agents.agent_toolkits.playwright.toolkit import PlayWrightBrowserToolkit + +__all__ = ["PlayWrightBrowserToolkit"] diff --git a/langchain/agents/agent_toolkits/playwright/toolkit.py b/langchain/agents/agent_toolkits/playwright/toolkit.py new file mode 100644 index 00000000..cd372275 --- /dev/null +++ 
b/langchain/agents/agent_toolkits/playwright/toolkit.py @@ -0,0 +1,66 @@ +"""Playwright web browser toolkit.""" +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Type + +from pydantic import Extra, Field, root_validator + +from langchain.agents.agent_toolkits.base import BaseToolkit +from langchain.tools.base import BaseTool +from langchain.tools.playwright.base import BaseBrowserTool +from langchain.tools.playwright.click import ClickTool +from langchain.tools.playwright.current_page import CurrentWebPageTool +from langchain.tools.playwright.extract_hyperlinks import ExtractHyperlinksTool +from langchain.tools.playwright.extract_text import ExtractTextTool +from langchain.tools.playwright.get_elements import GetElementsTool +from langchain.tools.playwright.navigate import NavigateTool +from langchain.tools.playwright.navigate_back import NavigateBackTool +from langchain.tools.playwright.utils import create_playwright_browser + +if TYPE_CHECKING: + from playwright.async_api import Browser as AsyncBrowser + + +class PlayWrightBrowserToolkit(BaseToolkit): + """Toolkit for web browser tools.""" + + browser: AsyncBrowser = Field(default_factory=create_playwright_browser) + + class Config: + """Configuration for this pydantic object.""" + + extra = Extra.forbid + arbitrary_types_allowed = True + + @root_validator + def check_args(cls, values: dict) -> dict: + """Check that the arguments are valid.""" + try: + from playwright.async_api import Browser as AsyncBrowser # noqa: F401 + except ImportError: + raise ValueError( + "The 'playwright' package is required to use this tool." + " Please install it with 'pip install playwright'." 
+ ) + return values + + def get_tools(self) -> List[BaseTool]: + """Get the tools in the toolkit.""" + tool_classes: List[Type[BaseBrowserTool]] = [ + ClickTool, + NavigateTool, + NavigateBackTool, + ExtractTextTool, + ExtractHyperlinksTool, + GetElementsTool, + CurrentWebPageTool, + ] + + return [tool_cls.from_browser(self.browser) for tool_cls in tool_classes] + + @classmethod + def from_browser(cls, browser: AsyncBrowser) -> PlayWrightBrowserToolkit: + from playwright.async_api import Browser as AsyncBrowser + + cls.update_forward_refs(AsyncBrowser=AsyncBrowser) + return cls(browser=browser) diff --git a/langchain/tools/__init__.py b/langchain/tools/__init__.py index 7ffcd0db..3cf1dad2 100644 --- a/langchain/tools/__init__.py +++ b/langchain/tools/__init__.py @@ -8,20 +8,38 @@ from langchain.tools.google_search.tool import GoogleSearchResults, GoogleSearch from langchain.tools.ifttt import IFTTTWebhook from langchain.tools.openapi.utils.api_models import APIOperation from langchain.tools.openapi.utils.openapi_utils import OpenAPISpec +from langchain.tools.playwright import ( + BaseBrowserTool, + ClickTool, + CurrentWebPageTool, + ExtractHyperlinksTool, + ExtractTextTool, + GetElementsTool, + NavigateBackTool, + NavigateTool, +) from langchain.tools.plugin import AIPluginTool __all__ = [ - "AIPluginTool", "APIOperation", + "BaseBrowserTool", + "BaseTool", "BingSearchResults", "BingSearchRun", + "ClickTool", + "CurrentWebPageTool", "DuckDuckGoSearchResults", "DuckDuckGoSearchRun", "DuckDuckGoSearchRun", + "ExtractHyperlinksTool", + "ExtractTextTool", + "GetElementsTool", "GooglePlacesTool", "GoogleSearchResults", "GoogleSearchRun", "IFTTTWebhook", + "NavigateBackTool", + "NavigateTool", "OpenAPISpec", - "BaseTool", + "AIPluginTool", ] diff --git a/langchain/tools/playwright/__init__.py b/langchain/tools/playwright/__init__.py new file mode 100644 index 00000000..8f7e6153 --- /dev/null +++ b/langchain/tools/playwright/__init__.py @@ -0,0 +1,21 @@ +"""Browser tools 
and toolkit.""" + +from langchain.tools.playwright.base import BaseBrowserTool +from langchain.tools.playwright.click import ClickTool +from langchain.tools.playwright.current_page import CurrentWebPageTool +from langchain.tools.playwright.extract_hyperlinks import ExtractHyperlinksTool +from langchain.tools.playwright.extract_text import ExtractTextTool +from langchain.tools.playwright.get_elements import GetElementsTool +from langchain.tools.playwright.navigate import NavigateTool +from langchain.tools.playwright.navigate_back import NavigateBackTool + +__all__ = [ + "NavigateTool", + "NavigateBackTool", + "ExtractTextTool", + "ExtractHyperlinksTool", + "GetElementsTool", + "BaseBrowserTool", + "ClickTool", + "CurrentWebPageTool", +] diff --git a/langchain/tools/playwright/base.py b/langchain/tools/playwright/base.py new file mode 100644 index 00000000..95db7f92 --- /dev/null +++ b/langchain/tools/playwright/base.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from pydantic import Field, root_validator + +from langchain.tools.base import BaseTool +from langchain.tools.playwright.utils import create_playwright_browser, run_async + +if TYPE_CHECKING: + from playwright.async_api import Browser as AsyncBrowser + + +class BaseBrowserTool(BaseTool): + """Base class for browser tools.""" + + browser: AsyncBrowser = Field(default_factory=create_playwright_browser) + + @root_validator + def check_args(cls, values: dict) -> dict: + """Check that the arguments are valid.""" + try: + from playwright.async_api import Browser as AsyncBrowser # noqa: F401 + except ImportError: + raise ValueError( + "The 'playwright' package is required to use this tool." + " Please install it with 'pip install playwright'." 
+ ) + return values + + def _run(self, *args: Any, **kwargs: Any) -> str: + """Use the tool.""" + return run_async(self._arun(*args, **kwargs)) + + @classmethod + def from_browser(cls, browser: AsyncBrowser) -> BaseBrowserTool: + from playwright.async_api import Browser as AsyncBrowser + + cls.update_forward_refs(AsyncBrowser=AsyncBrowser) + return cls(browser=browser) diff --git a/langchain/tools/playwright/click.py b/langchain/tools/playwright/click.py new file mode 100644 index 00000000..0d963d35 --- /dev/null +++ b/langchain/tools/playwright/click.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from typing import Type + +from pydantic import BaseModel, Field + +from langchain.tools.playwright.base import BaseBrowserTool +from langchain.tools.playwright.utils import ( + get_current_page, +) + + +class ClickToolInput(BaseModel): + """Input for ClickTool.""" + + selector: str = Field(..., description="CSS selector for the element to click") + + +class ClickTool(BaseBrowserTool): + name: str = "click_element" + description: str = "Click on an element with the given CSS selector" + args_schema: Type[BaseModel] = ClickToolInput + + async def _arun(self, selector: str) -> str: + """Use the tool.""" + page = await get_current_page(self.browser) + # Navigate to the desired webpage before using this tool + await page.click(selector) + return f"Clicked element '{selector}'" diff --git a/langchain/tools/playwright/current_page.py b/langchain/tools/playwright/current_page.py new file mode 100644 index 00000000..bde0ff8a --- /dev/null +++ b/langchain/tools/playwright/current_page.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from typing import Type + +from pydantic import BaseModel + +from langchain.tools.playwright.base import BaseBrowserTool +from langchain.tools.playwright.utils import ( + get_current_page, +) + + +class CurrentWebPageTool(BaseBrowserTool): + name: str = "current_webpage" + description: str = "Returns the URL of the current page" + 
args_schema: Type[BaseModel] = BaseModel + + async def _arun(self) -> str: + """Use the tool.""" + page = await get_current_page(self.browser) + return str(page.url) diff --git a/langchain/tools/playwright/extract_hyperlinks.py b/langchain/tools/playwright/extract_hyperlinks.py new file mode 100644 index 00000000..9e792f19 --- /dev/null +++ b/langchain/tools/playwright/extract_hyperlinks.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Type + +from pydantic import BaseModel, Field, root_validator + +from langchain.tools.playwright.base import BaseBrowserTool +from langchain.tools.playwright.utils import get_current_page + +if TYPE_CHECKING: + pass + + +class ExtractHyperlinksToolInput(BaseModel): + """Input for ExtractHyperlinksTool.""" + + absolute_urls: bool = Field( + default=False, + description="Return absolute URLs instead of relative URLs", + ) + + +class ExtractHyperlinksTool(BaseBrowserTool): + """Extract all hyperlinks on the page.""" + + name: str = "extract_hyperlinks" + description: str = "Extract all hyperlinks on the current webpage" + args_schema: Type[BaseModel] = ExtractHyperlinksToolInput + + @root_validator + def check_args(cls, values: dict) -> dict: + """Check that the arguments are valid.""" + try: + from bs4 import BeautifulSoup # noqa: F401 + except ImportError: + raise ValueError( + "The 'beautifulsoup4' package is required to use this tool." + " Please install it with 'pip install beautifulsoup4'." 
+ ) + return values + + async def _arun(self, absolute_urls: bool = False) -> str: + """Use the tool.""" + from urllib.parse import urljoin + + from bs4 import BeautifulSoup + + page = await get_current_page(self.browser) + html_content = await page.content() + + # Parse the HTML content with BeautifulSoup + soup = BeautifulSoup(html_content, "lxml") + + # Find all the anchor elements and extract their href attributes + anchors = soup.find_all("a") + if absolute_urls: + base_url = page.url + links = [urljoin(base_url, anchor.get("href", "")) for anchor in anchors] + else: + links = [anchor.get("href", "") for anchor in anchors] + + # Return the list of links as a JSON string + return json.dumps(links) diff --git a/langchain/tools/playwright/extract_text.py b/langchain/tools/playwright/extract_text.py new file mode 100644 index 00000000..0ced6d35 --- /dev/null +++ b/langchain/tools/playwright/extract_text.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from typing import Type + +from pydantic import BaseModel, root_validator + +from langchain.tools.playwright.base import BaseBrowserTool +from langchain.tools.playwright.utils import get_current_page + + +class ExtractTextTool(BaseBrowserTool): + name: str = "extract_text" + description: str = "Extract all the text on the current webpage" + args_schema: Type[BaseModel] = BaseModel + + @root_validator + def check_args(cls, values: dict) -> dict: + """Check that the arguments are valid.""" + try: + from bs4 import BeautifulSoup # noqa: F401 + except ImportError: + raise ValueError( + "The 'beautifulsoup4' package is required to use this tool." + " Please install it with 'pip install beautifulsoup4'." 
+ ) + return values + + async def _arun(self) -> str: + """Use the tool.""" + # Use Beautiful Soup since it's faster than looping through the elements + from bs4 import BeautifulSoup + + page = await get_current_page(self.browser) + html_content = await page.content() + + # Parse the HTML content with BeautifulSoup + soup = BeautifulSoup(html_content, "lxml") + + return " ".join(text for text in soup.stripped_strings) diff --git a/langchain/tools/playwright/get_elements.py b/langchain/tools/playwright/get_elements.py new file mode 100644 index 00000000..2a90112d --- /dev/null +++ b/langchain/tools/playwright/get_elements.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, List, Optional, Sequence, Type + +from pydantic import BaseModel, Field + +from langchain.tools.playwright.base import BaseBrowserTool +from langchain.tools.playwright.utils import get_current_page + +if TYPE_CHECKING: + from playwright.async_api import Page as AsyncPage + + +class GetElementsToolInput(BaseModel): + """Input for GetElementsTool.""" + + selector: str = Field( + ..., + description="CSS selector, such as '*', 'div', 'p', 'a', #id, .classname", + ) + attributes: List[str] = Field( + default_factory=lambda: ["innerText"], + description="Set of attributes to retrieve for each element", + ) + + +async def _get_elements( + page: AsyncPage, selector: str, attributes: Sequence[str] +) -> List[dict]: + """Get elements matching the given CSS selector.""" + elements = await page.query_selector_all(selector) + results = [] + for element in elements: + result = {} + for attribute in attributes: + if attribute == "innerText": + val: Optional[str] = await element.inner_text() + else: + val = await element.get_attribute(attribute) + if val is not None and val.strip() != "": + result[attribute] = val + if result: + results.append(result) + return results + + +class GetElementsTool(BaseBrowserTool): + name: str = "get_elements" + description: str 
= ( + "Retrieve elements in the current web page matching the given CSS selector" + ) + args_schema: Type[BaseModel] = GetElementsToolInput + + async def _arun( + self, selector: str, attributes: Sequence[str] = ("innerText",) + ) -> str: + """Use the tool.""" + page = await get_current_page(self.browser) + # Precondition: the browser is already on the desired webpage; this tool does not navigate + results = await _get_elements(page, selector, attributes) + return json.dumps(results) diff --git a/langchain/tools/playwright/navigate.py b/langchain/tools/playwright/navigate.py new file mode 100644 index 00000000..cac35719 --- /dev/null +++ b/langchain/tools/playwright/navigate.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from typing import Type + +from pydantic import BaseModel, Field + +from langchain.tools.playwright.base import BaseBrowserTool +from langchain.tools.playwright.utils import ( + get_current_page, +) + + +class NavigateToolInput(BaseModel): + """Input for NavigateTool.""" + + url: str = Field(..., description="url to navigate to") + + +class NavigateTool(BaseBrowserTool): + name: str = "navigate_browser" + description: str = "Navigate a browser to the specified URL" + args_schema: Type[BaseModel] = NavigateToolInput + + async def _arun(self, url: str) -> str: + """Use the tool.""" + page = await get_current_page(self.browser) + response = await page.goto(url) + status = response.status if response else "unknown" + return f"Navigating to {url} returned status code {status}" diff --git a/langchain/tools/playwright/navigate_back.py b/langchain/tools/playwright/navigate_back.py new file mode 100644 index 00000000..114fc81c --- /dev/null +++ b/langchain/tools/playwright/navigate_back.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from typing import Type + +from pydantic import BaseModel + +from langchain.tools.playwright.base import BaseBrowserTool +from langchain.tools.playwright.utils import ( + get_current_page, +) + + +class 
NavigateBackTool(BaseBrowserTool): + """Navigate back to the previous page in the browser history.""" + + name: str = "previous_webpage" + description: str = "Navigate back to the previous page in the browser history" + args_schema: Type[BaseModel] = BaseModel + + async def _arun(self) -> str: + """Use the tool.""" + page = await get_current_page(self.browser) + response = await page.go_back() + + if response: + return ( + f"Navigated back to the previous page with URL '{response.url}'." + f" Status code {response.status}" + ) + else: + return "Unable to navigate back; no previous page in the history" diff --git a/langchain/tools/playwright/utils.py b/langchain/tools/playwright/utils.py new file mode 100644 index 00000000..4903836a --- /dev/null +++ b/langchain/tools/playwright/utils.py @@ -0,0 +1,35 @@ +"""Utilities for the Playwright browser tools.""" +from __future__ import annotations + +import asyncio +from typing import TYPE_CHECKING, Any, Coroutine, TypeVar + +if TYPE_CHECKING: + from playwright.async_api import Browser as AsyncBrowser + from playwright.async_api import Page as AsyncPage + + +async def get_current_page(browser: AsyncBrowser) -> AsyncPage: + if not browser.contexts: + context = await browser.new_context() + return await context.new_page() + context = browser.contexts[0] # Assuming you're using the default browser context + if not context.pages: + return await context.new_page() + # Assuming the last page in the list is the active one + return context.pages[-1] + + +def create_playwright_browser() -> AsyncBrowser: + from playwright.async_api import async_playwright + + browser = run_async(async_playwright().start()) + return run_async(browser.chromium.launch(headless=True)) + + +T = TypeVar("T") + + +def run_async(coro: Coroutine[Any, Any, T]) -> T: + event_loop = asyncio.get_event_loop() + return event_loop.run_until_complete(coro)