from __future__ import annotations

import json
from typing import Any, Optional, Type

from langchain_core.callbacks import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)
from langchain_core.pydantic_v1 import BaseModel, Field, root_validator

from langchain_community.tools.playwright.base import BaseBrowserTool
from langchain_community.tools.playwright.utils import (
    aget_current_page,
    get_current_page,
)


class ExtractHyperlinksToolInput(BaseModel):
    """Input schema for ExtractHyperlinksTool."""

    absolute_urls: bool = Field(
        default=False,
        description="Return absolute URLs instead of relative URLs",
    )
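
# For illustration (a sketch, not part of the original module): the schema
# above corresponds to tool input of the form {"absolute_urls": <bool>}, so
# ``ExtractHyperlinksToolInput(absolute_urls=True)`` validates, and omitting
# the field falls back to the default of False.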


class ExtractHyperlinksTool(BaseBrowserTool):
    """Extract all hyperlinks on the page."""

    name: str = "extract_hyperlinks"
    description: str = "Extract all hyperlinks on the current webpage"
    args_schema: Type[BaseModel] = ExtractHyperlinksToolInput

    @root_validator
    def check_bs_import(cls, values: dict) -> dict:
        """Check that the required beautifulsoup4 package is installed."""
        try:
            from bs4 import BeautifulSoup  # noqa: F401
        except ImportError:
            raise ImportError(
                "The 'beautifulsoup4' package is required to use this tool."
                " Please install it with 'pip install beautifulsoup4'."
            )
        return values

    @staticmethod
    def scrape_page(page: Any, html_content: str, absolute_urls: bool) -> str:
        """Extract hyperlinks from ``html_content`` and return them as a JSON string."""
        from urllib.parse import urljoin

        from bs4 import BeautifulSoup

        # Parse the HTML content with BeautifulSoup (note that the "lxml"
        # parser additionally requires the lxml package to be installed)
        soup = BeautifulSoup(html_content, "lxml")

        # Find all the anchor elements and extract their href attributes
        anchors = soup.find_all("a")
        if absolute_urls:
            # Resolve each href against the current page's URL
            base_url = page.url
            links = [urljoin(base_url, anchor.get("href", "")) for anchor in anchors]
        else:
            links = [anchor.get("href", "") for anchor in anchors]
        # Return the list of links as a JSON string
        return json.dumps(links)
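
    # Quick illustration with hypothetical values (assuming a page whose
    # ``.url`` is "https://example.com/docs"):
    #     scrape_page(page, '<a href="/a">A</a>', absolute_urls=False)
    #         -> '["/a"]'
    #     scrape_page(page, '<a href="/a">A</a>', absolute_urls=True)
    #         -> '["https://example.com/a"]'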

    def _run(
        self,
        absolute_urls: bool = False,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool."""
        if self.sync_browser is None:
            raise ValueError(f"Synchronous browser not provided to {self.name}")
        page = get_current_page(self.sync_browser)
        html_content = page.content()
        return self.scrape_page(page, html_content, absolute_urls)

    async def _arun(
        self,
        absolute_urls: bool = False,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool asynchronously."""
        if self.async_browser is None:
            raise ValueError(f"Asynchronous browser not provided to {self.name}")
        page = await aget_current_page(self.async_browser)
        html_content = await page.content()
        return self.scrape_page(page, html_content, absolute_urls)
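

# Example usage (a minimal sketch; assumes the Playwright browsers have been
# installed with ``playwright install`` and that the current page has been
# navigated somewhere first, e.g. with NavigateTool, since a freshly created
# browser starts on about:blank):
#
#     from langchain_community.tools.playwright.utils import (
#         create_sync_playwright_browser,
#     )
#
#     browser = create_sync_playwright_browser()
#     tool = ExtractHyperlinksTool.from_browser(sync_browser=browser)
#     links_json = tool.run({"absolute_urls": True})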