async def aload(self) -> List[Document]:
    """Load the specified URLs with Playwright and create Documents asynchronously.

    Use this function when in a jupyter notebook environment.

    For every URL a fresh page is opened in a Chromium browser, visible
    elements matching ``self.remove_selectors`` are removed from the DOM,
    and the resulting HTML is partitioned with ``unstructured`` into one
    Document whose metadata records the source URL.

    Returns:
        List[Document]: A list of Document instances with loaded content.

    Raises:
        Exception: Any error raised while fetching or processing a URL is
            re-raised when ``self.continue_on_failure`` is falsy; otherwise
            it is logged and the URL is skipped.
    """
    from playwright.async_api import async_playwright
    from unstructured.partition.html import partition_html

    docs: List[Document] = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=self.headless)
        try:
            for url in self.urls:
                try:
                    page = await browser.new_page()
                    try:
                        await page.goto(url)

                        for selector in self.remove_selectors or []:
                            matches = await page.locator(selector).all()
                            for match in matches:
                                if await match.is_visible():
                                    await match.evaluate(
                                        "element => element.remove()"
                                    )

                        page_source = await page.content()
                    finally:
                        # Release the page as soon as its HTML is captured so
                        # pages do not pile up across a long URL list.
                        await page.close()

                    partitions = partition_html(text=page_source)
                    text = "\n\n".join([str(el) for el in partitions])
                    metadata = {"source": url}
                    docs.append(Document(page_content=text, metadata=metadata))
                except Exception as e:
                    if self.continue_on_failure:
                        logger.error(
                            f"Error fetching or processing {url}, exception: {e}"
                        )
                    else:
                        raise
        finally:
            # Close the browser even when a failure propagates out of the loop.
            await browser.close()
    return docs
@pytest.mark.asyncio
async def test_playwright_async_url_loader() -> None:
    """Check that ``aload`` yields at least one document for a set of live URLs."""
    target_urls = [
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
        "https://techmeme.com",
        "https://techcrunch.com",
    ]
    selectors_to_strip = ["header", "footer"]

    # continue_on_failure=False: any fetch or parse error fails the test.
    playwright_loader = PlaywrightURLLoader(
        urls=target_urls,
        remove_selectors=selectors_to_strip,
        continue_on_failure=False,
        headless=True,
    )

    loaded_documents = await playwright_loader.aload()
    assert len(loaded_documents) > 0