langchain/libs/community/langchain_community/tools/playwright/extract_hyperlinks.py
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to the new package langchain-community in a backwards-compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```
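
Because the old `langchain` paths keep working, existing imports do not need to change immediately. A minimal sketch of what "backwards compatible" means here, using the playwright tool in this file as the example (the exact re-export/shim mechanics are assumed, not spelled out in this commit message):

```
# Old import path, kept working by re-exports left behind in langchain:
from langchain.tools.playwright import ExtractHyperlinksTool as LegacyExtractHyperlinksTool

# New canonical import path in langchain-community:
from langchain_community.tools.playwright import ExtractHyperlinksTool

# Both names should refer to the same implementation after the split.
print(LegacyExtractHyperlinksTool is ExtractHyperlinksTool)
```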

Moved the following to core:
```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```
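
The relocated utilities are likewise importable from their new homes under `langchain_core.utils`. A hedged example (the specific helper names are assumptions for illustration, not listed in this commit message):

```
# Sketch only: the helper names below are assumed examples of what lives in
# the moved modules, not an exhaustive or authoritative list.
from langchain_core.utils.json_schema import dereference_refs
from langchain_core.utils.html import extract_sub_links
```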

See .scripts/community_split/script_integrations.sh for all changes

from __future__ import annotations

import json
from typing import TYPE_CHECKING, Any, Optional, Type

from langchain_core.callbacks import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)
from langchain_core.pydantic_v1 import BaseModel, Field, root_validator

from langchain_community.tools.playwright.base import BaseBrowserTool
from langchain_community.tools.playwright.utils import (
    aget_current_page,
    get_current_page,
)

if TYPE_CHECKING:
    pass


class ExtractHyperlinksToolInput(BaseModel):
    """Input for ExtractHyperlinksTool."""

    absolute_urls: bool = Field(
        default=False,
        description="Return absolute URLs instead of relative URLs",
    )


class ExtractHyperlinksTool(BaseBrowserTool):
    """Extract all hyperlinks on the page."""

    name: str = "extract_hyperlinks"
    description: str = "Extract all hyperlinks on the current webpage"
    args_schema: Type[BaseModel] = ExtractHyperlinksToolInput

    @root_validator
    def check_bs_import(cls, values: dict) -> dict:
        """Check that the arguments are valid."""
        try:
            from bs4 import BeautifulSoup  # noqa: F401
        except ImportError:
            raise ImportError(
                "The 'beautifulsoup4' package is required to use this tool."
                " Please install it with 'pip install beautifulsoup4'."
            )
        return values

    @staticmethod
    def scrape_page(page: Any, html_content: str, absolute_urls: bool) -> str:
        from urllib.parse import urljoin

        from bs4 import BeautifulSoup

        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(html_content, "lxml")

        # Find all the anchor elements and extract their href attributes
        anchors = soup.find_all("a")
        if absolute_urls:
            base_url = page.url
            links = [urljoin(base_url, anchor.get("href", "")) for anchor in anchors]
        else:
            links = [anchor.get("href", "") for anchor in anchors]

        # Return the list of links as a JSON string
        return json.dumps(links)

    def _run(
        self,
        absolute_urls: bool = False,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool."""
        if self.sync_browser is None:
            raise ValueError(f"Synchronous browser not provided to {self.name}")
        page = get_current_page(self.sync_browser)
        html_content = page.content()
        return self.scrape_page(page, html_content, absolute_urls)

    async def _arun(
        self,
        absolute_urls: bool = False,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> str:
        """Use the tool asynchronously."""
        if self.async_browser is None:
            raise ValueError(f"Asynchronous browser not provided to {self.name}")
        page = await aget_current_page(self.async_browser)
        html_content = await page.content()
        return self.scrape_page(page, html_content, absolute_urls)
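
A minimal usage sketch for the tool above, not part of the file itself: it assumes the `create_sync_playwright_browser` helper, the `NavigateTool` sibling, and the `from_browser` constructor from the surrounding playwright tooling, plus the `playwright`, `beautifulsoup4`, and `lxml` packages being installed.

```
# Hedged usage sketch; helper names are assumed from langchain_community's
# playwright utilities and BaseBrowserTool, not defined in this file.
from langchain_community.tools.playwright import ExtractHyperlinksTool, NavigateTool
from langchain_community.tools.playwright.utils import create_sync_playwright_browser

# Launch a synchronous Playwright browser and bind both tools to it.
sync_browser = create_sync_playwright_browser()
navigate = NavigateTool.from_browser(sync_browser=sync_browser)
extract = ExtractHyperlinksTool.from_browser(sync_browser=sync_browser)

# Navigate to a page, then extract its links as a JSON-encoded list.
navigate.run({"url": "https://example.com"})
print(extract.run({"absolute_urls": True}))
```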