mirror of
https://github.com/hwchase17/langchain
synced 2024-11-02 09:40:22 +00:00
45ddf4d26f
- [ ] **PR message**: - **Description:** Refactored the lazy_load method to use asynchronous execution for improved performance. The method now initiates scraping of all URLs simultaneously using asyncio.gather, enhancing data fetching efficiency. Each Document object is yielded immediately once its content becomes available, streamlining the entire process. - **Issue:** N/A - **Dependencies:** Requires the asyncio library for handling asynchronous tasks, which should already be part of standard Python libraries in Python 3.7 and above. - **Email:** [r73327118@gmail.com](mailto:r73327118@gmail.com) --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
98 lines
3.3 KiB
Python
98 lines
3.3 KiB
Python
import asyncio
|
|
import logging
|
|
from typing import AsyncIterator, Iterator, List
|
|
|
|
from langchain_core.documents import Document
|
|
|
|
from langchain_community.document_loaders.base import BaseLoader
|
|
|
|
# Module-level logger, named after this module; the loader uses it to
# report scraping progress.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class AsyncChromiumLoader(BaseLoader):
    """Scrape HTML pages from URLs using a headless instance of Chromium.

    Every URL is fetched with Playwright's async API in its own browser
    instance, and the resulting HTML is wrapped in a ``Document`` whose
    metadata records the source URL under the ``"source"`` key.
    """

    def __init__(self, urls: List[str], *, headless: bool = True):
        """Initialize the loader with a list of URLs.

        Args:
            urls: A list of URLs to scrape content from.
            headless: Whether to run the browser in headless mode.

        Raises:
            ImportError: If the required 'playwright' package is not installed.
        """
        self.urls = urls
        self.headless = headless

        # Fail at construction time rather than on the first scrape.
        try:
            import playwright  # noqa: F401
        except ImportError as e:
            # Chain the original error so the root cause stays visible.
            raise ImportError(
                "playwright is required for AsyncChromiumLoader. "
                "Please install it with `pip install playwright`."
            ) from e

    async def ascrape_playwright(self, url: str) -> str:
        """Asynchronously scrape the content of a URL with Playwright's async API.

        A new Playwright session and Chromium browser are started for each
        call and shut down before the method returns.

        Args:
            url: The URL to scrape.

        Returns:
            The scraped HTML content, or a string of the form
            ``"Error: <message>"`` if opening the page, navigating to it or
            reading its content raises an exception. Errors raised while
            launching the browser are not caught here and propagate to the
            caller.
        """
        # Imported lazily so the module can be imported without playwright.
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        results = ""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless)
            try:
                page = await browser.new_page()
                await page.goto(url)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
                # Best effort: report the failure in-band instead of raising,
                # but leave a trace in the logs as well.
                logger.warning("Error while scraping %s: %s", url, e)
                results = f"Error: {e}"
            finally:
                # Also runs on cancellation, so the browser is always closed.
                await browser.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load text content from the provided URLs.

        URLs are scraped one at a time, and each Document is yielded as soon
        as its page has been scraped instead of waiting for all URLs.

        Note:
            Each scrape is driven with ``asyncio.run``, which cannot be used
            from a thread that already has a running event loop. Use
            ``alazy_load`` in that situation.

        Yields:
            Document: The scraped content encapsulated within a Document object.
        """
        for url in self.urls:
            html_content = asyncio.run(self.ascrape_playwright(url))
            metadata = {"source": url}
            yield Document(page_content=html_content, metadata=metadata)

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Asynchronously load text content from the provided URLs.

        Scraping of all URLs is started up front so the requests run
        concurrently. Documents are yielded in the same order as ``urls``;
        each one is yielded as soon as its own content (and that of every
        URL before it) is available, without waiting for the remaining URLs.

        If the consumer stops iterating early, or a scrape raises, the
        scrapes that are still running are cancelled.

        Yields:
            Document: A Document object containing the scraped content, along
            with its source URL as metadata.
        """
        # Snapshot the URLs so tasks and metadata stay aligned even if
        # ``self.urls`` is modified while iterating.
        urls = list(self.urls)
        tasks = [asyncio.create_task(self.ascrape_playwright(url)) for url in urls]
        try:
            for url, task in zip(urls, tasks):
                content = await task
                metadata = {"source": url}
                yield Document(page_content=content, metadata=metadata)
        finally:
            # Cancelling a task that has already finished is a no-op.
            for task in tasks:
                task.cancel()
            if tasks:
                # Wait for cancellations to finish (so browsers get closed)
                # and retrieve any exceptions from tasks never awaited above.
                await asyncio.gather(*tasks, return_exceptions=True)