Harrison/playwright selector (#3185)

Co-authored-by: zhyuri <4649294+zhyuri@users.noreply.github.com>
fix_agent_callbacks
Harrison Chase 1 year ago committed by GitHub
parent 68cd37175e
commit 9181cd9b22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -67,9 +67,10 @@ class PlaywrightURLLoader(BaseLoader):
page.goto(url)
for selector in self.remove_selectors or []:
element = page.locator(selector)
if element.is_visible():
element.evaluate("element => element.remove()")
elements = page.locator(selector).all()
for element in elements:
if element.is_visible():
element.evaluate("element => element.remove()")
page_source = page.content()
elements = partition_html(text=page_source)

@@ -0,0 +1,21 @@
"""Tests for the Playwright URL loader"""
from langchain.document_loaders import PlaywrightURLLoader
def test_playwright_url_loader() -> None:
    """Smoke-test PlaywrightURLLoader against a handful of live pages.

    The loader is built with ``header`` and ``footer`` passed as
    ``remove_selectors``, ``continue_on_failure`` disabled and the browser
    in headless mode. The test only checks that loading yields at least
    one document; it does not inspect document contents.

    NOTE(review): this hits real external sites, so it presumably needs
    network access and an installed Playwright browser - confirm in CI.
    """
    target_urls = [
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
        "https://techmeme.com",
        "https://techcrunch.com",
    ]
    selectors_to_strip = ["header", "footer"]

    # Build the loader and load in a single expression; nothing else in the
    # test needs the loader object itself.
    documents = PlaywrightURLLoader(
        urls=target_urls,
        remove_selectors=selectors_to_strip,
        continue_on_failure=False,
        headless=True,
    ).load()

    assert len(documents) > 0
Loading…
Cancel
Save