diff --git a/langchain/document_loaders/url_playwright.py b/langchain/document_loaders/url_playwright.py
index 508283c3..15739263 100644
--- a/langchain/document_loaders/url_playwright.py
+++ b/langchain/document_loaders/url_playwright.py
@@ -67,9 +67,12 @@ class PlaywrightURLLoader(BaseLoader):
                     page.goto(url)
 
                     for selector in self.remove_selectors or []:
-                        element = page.locator(selector)
-                        if element.is_visible():
-                            element.evaluate("element => element.remove()")
+                        elements = page.locator(selector).all()
+                        # Locator.all() yields index-based (nth) locators, so remove
+                        # matches last-to-first to keep remaining indices stable.
+                        for element in reversed(elements):
+                            if element.is_visible():
+                                element.evaluate("element => element.remove()")
 
                     page_source = page.content()
                     elements = partition_html(text=page_source)
diff --git a/tests/integration_tests/document_loaders/test_url_playwright.py b/tests/integration_tests/document_loaders/test_url_playwright.py
new file mode 100644
index 00000000..f24f9286
--- /dev/null
+++ b/tests/integration_tests/document_loaders/test_url_playwright.py
@@ -0,0 +1,21 @@
+"""Tests for the Playwright URL loader"""
+
+from langchain.document_loaders import PlaywrightURLLoader
+
+
+def test_playwright_url_loader() -> None:
+    """Test Playwright URL loader."""
+    urls = [
+        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
+        "https://goo.gl/maps/NDSHwePEyaHMFGwh8",
+        "https://techmeme.com",
+        "https://techcrunch.com",
+    ]
+    loader = PlaywrightURLLoader(
+        urls=urls,
+        remove_selectors=["header", "footer"],
+        continue_on_failure=False,
+        headless=True,
+    )
+    docs = loader.load()
+    assert len(docs) > 0