forked from Archives/langchain
Harrison/playwright selector (#3185)
Co-authored-by: zhyuri <4649294+zhyuri@users.noreply.github.com>
This commit is contained in:
parent
68cd37175e
commit
9181cd9b22
@@ -67,9 +67,10 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
page.goto(url)
|
||||
|
||||
for selector in self.remove_selectors or []:
|
||||
element = page.locator(selector)
|
||||
if element.is_visible():
|
||||
element.evaluate("element => element.remove()")
|
||||
elements = page.locator(selector).all()
|
||||
for element in elements:
|
||||
if element.is_visible():
|
||||
element.evaluate("element => element.remove()")
|
||||
|
||||
page_source = page.content()
|
||||
elements = partition_html(text=page_source)
|
||||
|
@@ -0,0 +1,21 @@
|
||||
"""Tests for the Playwright URL loader"""
|
||||
|
||||
from langchain.document_loaders import PlaywrightURLLoader
|
||||
|
||||
|
||||
def test_playwright_url_loader() -> None:
    """Smoke-test PlaywrightURLLoader against a few public web pages.

    The loader is built in headless mode with the "header" and "footer"
    selectors listed for removal and with continue_on_failure disabled.
    The only check is that load() returns at least one document.

    NOTE(review): presumably this needs network access and an installed
    Playwright browser, since the URLs point at live sites -- confirm
    before running it in an offline CI environment.
    """
    target_urls = [
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
        "https://techmeme.com",
        "https://techcrunch.com",
    ]
    documents = PlaywrightURLLoader(
        urls=target_urls,
        remove_selectors=["header", "footer"],
        continue_on_failure=False,
        headless=True,
    ).load()
    # At least one document must come back from the listed pages.
    assert len(documents) > 0
|
Loading…
Reference in New Issue
Block a user