langchain/libs/community/langchain_community/document_loaders/chromium.py
Emilien Chauvet c3d4126eb1
community[minor]: add user agent for web scraping loaders (#22480)
**Description:** This PR adds a `USER_AGENT` env variable that is to be
used for web scraping. It creates a util to get that user agent and uses
it in the classes used for scraping in [this piece of
doc](https://python.langchain.com/v0.1/docs/use_cases/web_scraping/).
Identifying your scraper is considered good politeness practice; this
PR aims to make that easier.
**Issue:** `None`
**Dependencies:** `None`
**Twitter handle:** `None`
2024-06-05 15:20:34 +00:00

107 lines
3.6 KiB
Python

import asyncio
import logging
from typing import AsyncIterator, Iterator, List, Optional
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.utils.user_agent import get_user_agent
logger = logging.getLogger(__name__)
class AsyncChromiumLoader(BaseLoader):
    """Scrape HTML pages from URLs using a
    headless instance of the Chromium."""

    def __init__(
        self,
        urls: List[str],
        *,
        headless: bool = True,
        user_agent: Optional[str] = None,
    ):
        """Set up the loader for a collection of URL paths.

        Args:
            urls: A list of URLs to scrape content from.
            headless: Whether to run browser in headless mode.
            user_agent: The user agent to use for the browser.

        Raises:
            ImportError: If the required 'playwright' package is not installed.
        """
        self.urls = urls
        self.headless = headless
        # Fall back to the shared default user agent when none is supplied.
        self.user_agent = user_agent or get_user_agent()
        # Fail fast at construction time if playwright is missing, rather
        # than on the first scrape.
        try:
            import playwright  # noqa: F401
        except ImportError:
            raise ImportError(
                "playwright is required for AsyncChromiumLoader. "
                "Please install it with `pip install playwright`."
            )

    async def ascrape_playwright(self, url: str) -> str:
        """Fetch the rendered HTML of a URL with Playwright's async API.

        Args:
            url (str): The URL to scrape.

        Returns:
            str: The scraped HTML content, or an error message if an
            exception occurs.
        """
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        html = ""
        async with async_playwright() as pw:
            browser = await pw.chromium.launch(headless=self.headless)
            try:
                page = await browser.new_page(user_agent=self.user_agent)
                await page.goto(url)
                html = await page.content()  # Raw HTML of the loaded page
                logger.info("Content scraped")
            except Exception as e:
                # Report the failure in-band instead of raising, so a single
                # bad URL does not abort a multi-URL load.
                html = f"Error: {e}"
            await browser.close()
        return html

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one Document per configured URL.

        Each URL is scraped only when its Document is requested, rather than
        fetching everything before returning.

        Yields:
            Document: The scraped content encapsulated within a Document
            object, with the source URL in its metadata.
        """
        for url in self.urls:
            yield Document(
                page_content=asyncio.run(self.ascrape_playwright(url)),
                metadata={"source": url},
            )

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Asynchronously load text content from the provided URLs.

        All URLs are scraped concurrently via ``asyncio.gather``; Documents
        are then yielded in the same order as ``self.urls``.

        Yields:
            Document: A Document object containing the scraped content,
            along with its source URL as metadata.
        """
        pages = await asyncio.gather(
            *(self.ascrape_playwright(url) for url in self.urls)
        )
        for url, page_html in zip(self.urls, pages):
            yield Document(page_content=page_html, metadata={"source": url})