forked from Archives/langchain
Harrison/playwright selector (#3185)
Co-authored-by: zhyuri <4649294+zhyuri@users.noreply.github.com>
This commit is contained in:
parent
68cd37175e
commit
9181cd9b22
@@ -67,7 +67,8 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
page.goto(url)
|
page.goto(url)
|
||||||
|
|
||||||
for selector in self.remove_selectors or []:
|
for selector in self.remove_selectors or []:
|
||||||
element = page.locator(selector)
|
elements = page.locator(selector).all()
|
||||||
|
for element in elements:
|
||||||
if element.is_visible():
|
if element.is_visible():
|
||||||
element.evaluate("element => element.remove()")
|
element.evaluate("element => element.remove()")
|
||||||
|
|
||||||
|
@@ -0,0 +1,21 @@
|
|||||||
|
"""Tests for the Playwright URL loader"""
|
||||||
|
|
||||||
|
from langchain.document_loaders import PlaywrightURLLoader
|
||||||
|
|
||||||
|
|
||||||
|
def test_playwright_url_loader() -> None:
    """Check that the Playwright URL loader returns at least one document.

    The loader is built over four public URLs with ``header`` and ``footer``
    passed as selectors to remove, ``headless=True`` and
    ``continue_on_failure=False``.
    """
    # NOTE(review): with continue_on_failure=False a URL that fails to load
    # presumably fails the whole test rather than being skipped — confirm
    # against the loader implementation.
    loaded_docs = PlaywrightURLLoader(
        urls=[
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
            "https://techmeme.com",
            "https://techcrunch.com",
        ],
        remove_selectors=["header", "footer"],
        continue_on_failure=False,
        headless=True,
    ).load()

    # Only non-emptiness is checked; the content of the documents is not.
    assert len(loaded_docs) > 0
|
Loading…
Reference in New Issue
Block a user