Added matching async load func to PlaywrightURLLoader (#5938)

Fixes # (issue)

The existing PlaywrightURLLoader.load() function uses a synchronous
browser, which is not compatible with Jupyter.
This PR adds a sister function, aload(), which can be run inside a
notebook.

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
EllieRoseS 2023-07-13 22:51:38 +01:00 committed by GitHub
parent ae7714f1ba
commit c087ce74f7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 60 additions and 0 deletions

View File

@ -86,3 +86,43 @@ class PlaywrightURLLoader(BaseLoader):
raise e raise e
browser.close() browser.close()
return docs return docs
async def aload(self) -> List[Document]:
    """Load the specified URLs with async Playwright and create Documents.

    Use this function when in a jupyter notebook environment.

    For every URL in ``self.urls`` a new page is opened in a Chromium
    browser, visible elements matching ``self.remove_selectors`` are removed,
    and the remaining HTML is partitioned into text with ``unstructured``.

    Returns:
        List[Document]: A list of Document instances with loaded content.
        Each document's metadata holds the URL under the ``"source"`` key.

    Raises:
        Exception: Any error raised while fetching or processing a URL is
            re-raised when ``self.continue_on_failure`` is falsy; otherwise
            it is logged and the URL is skipped.
    """
    # Imported lazily so the optional dependencies are only required when
    # this loader is actually used.
    from playwright.async_api import async_playwright
    from unstructured.partition.html import partition_html

    docs: List[Document] = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=self.headless)
        try:
            for url in self.urls:
                try:
                    page = await browser.new_page()
                    try:
                        await page.goto(url)

                        for selector in self.remove_selectors or []:
                            elements = await page.locator(selector).all()
                            for element in elements:
                                if await element.is_visible():
                                    await element.evaluate(
                                        "element => element.remove()"
                                    )

                        page_source = await page.content()
                        elements = partition_html(text=page_source)
                        text = "\n\n".join([str(el) for el in elements])
                        metadata = {"source": url}
                        docs.append(Document(page_content=text, metadata=metadata))
                    finally:
                        # Close the page once this URL is done so open pages
                        # do not pile up across a long URL list.
                        await page.close()
                except Exception as e:
                    if self.continue_on_failure:
                        logger.error(
                            f"Error fetching or processing {url}, exception: {e}"
                        )
                    else:
                        # Bare raise keeps the original traceback intact.
                        raise
        finally:
            # Close the browser even when an error is propagated.
            await browser.close()
    return docs

View File

@ -1,4 +1,5 @@
"""Tests for the Playwright URL loader""" """Tests for the Playwright URL loader"""
import pytest
from langchain.document_loaders import PlaywrightURLLoader from langchain.document_loaders import PlaywrightURLLoader
@ -19,3 +20,22 @@ def test_playwright_url_loader() -> None:
) )
docs = loader.load() docs = loader.load()
assert len(docs) > 0 assert len(docs) > 0
@pytest.mark.asyncio
async def test_playwright_async_url_loader() -> None:
    """Check that ``aload`` returns at least one document for live URLs."""
    target_urls = [
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
        "https://techmeme.com",
        "https://techcrunch.com",
    ]
    # Strip page chrome and fail fast on any fetch/processing error.
    loader_kwargs = {
        "urls": target_urls,
        "remove_selectors": ["header", "footer"],
        "continue_on_failure": False,
        "headless": True,
    }
    url_loader = PlaywrightURLLoader(**loader_kwargs)

    documents = await url_loader.aload()

    assert len(documents) > 0