mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
Added matching async load func to PlaywrightURLLoader (#5938)
Fixes # (issue) The existing PlaywrightURLLoader load() function uses a synchronous browser, which is not compatible with Jupyter. This PR adds a sister function, aload(), which can be run inside a notebook. --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
ae7714f1ba
commit
c087ce74f7
@ -86,3 +86,43 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
raise e
|
raise e
|
||||||
browser.close()
|
browser.close()
|
||||||
return docs
|
return docs
|
||||||
|
|
||||||
|
async def aload(self) -> List[Document]:
    """Load the specified URLs with Playwright and create Documents asynchronously.

    Use this function when in a jupyter notebook environment.

    Each URL is opened in its own page, elements matching
    ``self.remove_selectors`` that are visible are removed from the DOM, and
    the remaining HTML is partitioned into text with ``unstructured``.

    Returns:
        List[Document]: A list of Document instances with loaded content.

    Raises:
        Exception: Any error raised while fetching or processing a URL is
            re-raised when ``self.continue_on_failure`` is falsy; otherwise
            it is logged and the URL is skipped.
    """
    from playwright.async_api import async_playwright
    from unstructured.partition.html import partition_html

    docs: List[Document] = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=self.headless)
        try:
            for url in self.urls:
                try:
                    page = await browser.new_page()
                    try:
                        await page.goto(url)

                        for selector in self.remove_selectors or []:
                            matches = await page.locator(selector).all()
                            for element in matches:
                                if await element.is_visible():
                                    await element.evaluate(
                                        "element => element.remove()"
                                    )

                        page_source = await page.content()
                    finally:
                        # Close the page for every URL so pages do not pile
                        # up in the browser while the remaining URLs load.
                        await page.close()

                    elements = partition_html(text=page_source)
                    text = "\n\n".join([str(el) for el in elements])
                    metadata = {"source": url}
                    docs.append(Document(page_content=text, metadata=metadata))
                except Exception as e:
                    if self.continue_on_failure:
                        logger.error(
                            f"Error fetching or processing {url}, exception: {e}"
                        )
                    else:
                        raise
        finally:
            # Close the browser even when an exception propagates.
            await browser.close()
    return docs
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
"""Tests for the Playwright URL loader"""
|
"""Tests for the Playwright URL loader"""
|
||||||
|
import pytest
|
||||||
|
|
||||||
from langchain.document_loaders import PlaywrightURLLoader
|
from langchain.document_loaders import PlaywrightURLLoader
|
||||||
|
|
||||||
@ -19,3 +20,22 @@ def test_playwright_url_loader() -> None:
|
|||||||
)
|
)
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
assert len(docs) > 0
|
assert len(docs) > 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_playwright_async_url_loader() -> None:
    """Check that ``PlaywrightURLLoader.aload`` yields at least one document.

    The loader is pointed at several remote URLs with
    ``continue_on_failure=False`` and run headless; "header" and "footer"
    selectors are passed for removal.
    """
    target_urls = [
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
        "https://techmeme.com",
        "https://techcrunch.com",
    ]
    selectors_to_strip = ["header", "footer"]

    url_loader = PlaywrightURLLoader(
        urls=target_urls,
        remove_selectors=selectors_to_strip,
        continue_on_failure=False,
        headless=True,
    )

    loaded_docs = await url_loader.aload()
    assert len(loaded_docs) > 0
|
||||||
|
Loading…
Reference in New Issue
Block a user