Added matching async load func to PlaywrightURLLoader (#5938)

Fixes # (issue) The existing PlaywrightURLLoader load() function uses a synchronous browser, which is not compatible with Jupyter. This PR adds a sister function, aload(), which can be run inside a notebook. --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
2024-11-08 07:10:35 +00:00 · 2023-07-13 22:51:38 +01:00 · 2023-07-13 22:51:38 +01:00 · c087ce74f7
commit c087ce74f7
parent ae7714f1ba
2 changed files with 60 additions and 0 deletions
--- a/langchain/document_loaders/url_playwright.py
+++ b/langchain/document_loaders/url_playwright.py
@ -86,3 +86,43 @@ class PlaywrightURLLoader(BaseLoader):
                        raise e
            browser.close()
        return docs
    async def aload(self) -> List[Document]:
        """Load the configured URLs with Playwright's async API.

        Async counterpart of ``load()``; use it where an event loop is already
        running (for example inside a Jupyter notebook), since the synchronous
        Playwright browser cannot be used there.

        Each URL is opened in its own page, visible elements matching
        ``self.remove_selectors`` are removed from the DOM, and the remaining
        HTML is partitioned with ``unstructured`` into one Document per URL.

        Returns:
            List[Document]: One Document per successfully processed URL, with
                the extracted text as ``page_content`` and ``{"source": url}``
                as metadata.

        Raises:
            Exception: Any error raised while fetching or processing a URL is
                re-raised when ``self.continue_on_failure`` is false; otherwise
                it is logged and the URL is skipped.
        """
        from playwright.async_api import async_playwright
        from unstructured.partition.html import partition_html

        docs: List[Document] = []
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless)
            try:
                for url in self.urls:
                    # Stays None when new_page() itself fails, so the cleanup
                    # below knows there is nothing to close.
                    page = None
                    try:
                        page = await browser.new_page()
                        await page.goto(url)
                        for selector in self.remove_selectors or []:
                            for element in await page.locator(selector).all():
                                if await element.is_visible():
                                    await element.evaluate(
                                        "element => element.remove()"
                                    )
                        page_source = await page.content()
                        parts = partition_html(text=page_source)
                        text = "\n\n".join(str(part) for part in parts)
                        metadata = {"source": url}
                        docs.append(Document(page_content=text, metadata=metadata))
                    except Exception as e:
                        if not self.continue_on_failure:
                            raise
                        logger.error(
                            f"Error fetching or processing {url}, exception: {e}"
                        )
                    finally:
                        # Close the page after every URL so pages do not pile
                        # up in the browser while the remaining URLs load.
                        if page is not None:
                            try:
                                await page.close()
                            except Exception as close_error:
                                # Cleanup is best-effort: a failed close must
                                # not discard documents or mask the real error.
                                logger.warning(
                                    f"Error closing page for {url}, "
                                    f"exception: {close_error}"
                                )
            finally:
                # Runs on the re-raise path too, not only after a clean loop.
                await browser.close()
        return docs
--- a/tests/integration_tests/document_loaders/test_url_playwright.py
+++ b/tests/integration_tests/document_loaders/test_url_playwright.py
@ -1,4 +1,5 @@
 """Tests for the Playwright URL loader"""
 import pytest
 from langchain.document_loaders import PlaywrightURLLoader
@ -19,3 +20,22 @@ def test_playwright_url_loader() -> None:
    )
    docs = loader.load()
    assert len(docs) > 0
@pytest.mark.asyncio
async def test_playwright_async_url_loader() -> None:
    """Check that the async Playwright loader yields documents for live URLs."""
    # continue_on_failure=False makes any fetch/processing error fail the test
    # instead of being logged and skipped.
    loader = PlaywrightURLLoader(
        urls=[
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
            "https://techmeme.com",
            "https://techcrunch.com",
        ],
        remove_selectors=["header", "footer"],
        continue_on_failure=False,
        headless=True,
    )
    loaded_docs = await loader.aload()
    assert len(loaded_docs) > 0