Added matching async load func to PlaywrightURLLoader (#5938)

Fixes # (issue) The existing PlaywrightURLLoader load() function uses a synchronous browser, which is not compatible with Jupyter. This PR adds a sister function aload() which can be run inside a notebook. --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
1 year ago · c087ce74f7
parent ae7714f1ba
commit c087ce74f7
2 changed files with 60 additions and 0 deletions
--- a/langchain/document_loaders/url_playwright.py
+++ b/langchain/document_loaders/url_playwright.py
@ -86,3 +86,43 @@ class PlaywrightURLLoader(BaseLoader):
                        raise e
            browser.close()
        return docs
+
+    async def aload(self) -> List[Document]:
+        """Load the specified URLs with Playwright and create Documents asynchronously.
+
+        Async counterpart of ``load()``; use this function when in a jupyter
+        notebook environment.
+
+        Each URL is opened in its own page of a single Chromium browser,
+        elements matching ``self.remove_selectors`` are removed when visible,
+        and the remaining HTML is partitioned into text with ``unstructured``.
+
+        Returns:
+            List[Document]: A list of Document instances with loaded content.
+
+        Raises:
+            Exception: Whatever error occurred while fetching or processing a
+                URL, when ``self.continue_on_failure`` is falsy.
+        """
+        from playwright.async_api import async_playwright
+        from unstructured.partition.html import partition_html
+
+        docs: List[Document] = []
+
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=self.headless)
+            try:
+                for url in self.urls:
+                    page = None
+                    try:
+                        page = await browser.new_page()
+                        await page.goto(url)
+
+                        for selector in self.remove_selectors or []:
+                            for node in await page.locator(selector).all():
+                                if await node.is_visible():
+                                    await node.evaluate("element => element.remove()")
+
+                        page_source = await page.content()
+                        elements = partition_html(text=page_source)
+                        text = "\n\n".join([str(el) for el in elements])
+                        metadata = {"source": url}
+                        docs.append(Document(page_content=text, metadata=metadata))
+                    except Exception as e:
+                        if not self.continue_on_failure:
+                            # Bare raise keeps the original traceback intact.
+                            raise
+                        logger.error(
+                            f"Error fetching or processing {url}, exception: {e}"
+                        )
+                    finally:
+                        # Close the page so pages do not pile up, one per URL.
+                        if page is not None:
+                            await page.close()
+            finally:
+                # Close the browser even when an error propagates.
+                await browser.close()
+        return docs
--- a/tests/integration_tests/document_loaders/test_url_playwright.py
+++ b/tests/integration_tests/document_loaders/test_url_playwright.py
@ -1,4 +1,5 @@
 """Tests for the Playwright URL loader"""
+import pytest

 from langchain.document_loaders import PlaywrightURLLoader

@ -19,3 +20,22 @@ def test_playwright_url_loader() -> None:
    )
    docs = loader.load()
    assert len(docs) > 0
+
+
+@pytest.mark.asyncio
+async def test_playwright_async_url_loader() -> None:
+    """Check that aload() yields at least one document for live URLs."""
+    # continue_on_failure=False: an error on any URL propagates and fails the test.
+    loader = PlaywrightURLLoader(
+        urls=[
+            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
+            "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
+            "https://techmeme.com",
+            "https://techcrunch.com",
+        ],
+        remove_selectors=["header", "footer"],
+        continue_on_failure=False,
+        headless=True,
+    )
+    documents = await loader.aload()
+    assert len(documents) > 0