refactor(document_loaders): abstract page evaluation logic in PlaywrightURLLoader (#9995)

This PR brings structural updates to `PlaywrightURLLoader`, aiming at
making the code more readable and extensible through the abstraction of
page evaluation logic. These changes also align this implementation with
a similar structure used in LangChain.js.

The key enhancements include:

1. Introduction of 'PlaywrightEvaluator', an abstract base class for all
evaluators.
2. Creation of 'UnstructuredHtmlEvaluator', a concrete class
implementing 'PlaywrightEvaluator', which uses the `unstructured` library
to process the page's HTML content.
3. Extension of 'PlaywrightURLLoader' constructor to optionally accept
an evaluator of the type 'PlaywrightEvaluator'. It defaults to
'UnstructuredHtmlEvaluator' if no evaluator is provided.
4. Refactoring of 'load' and 'aload' methods to use the 'evaluate' and
'evaluate_async' methods of the provided 'PlaywrightEvaluator' for page
content handling.

This update brings flexibility to 'PlaywrightURLLoader' as it can now
utilize different evaluators for page processing depending on the
requirement. The abstraction also improves code maintainability and
readability.

Twitter: @ywkim
This commit is contained in:
Bagatur 2023-08-31 00:45:33 -07:00 committed by GitHub
commit 6b5a970949
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 153 additions and 35 deletions

View File

@ -1,14 +1,104 @@
"""Loader that uses Playwright to load a page, then uses unstructured to load the html.
"""
import logging
from typing import List, Optional
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, List, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
if TYPE_CHECKING:
from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse
from playwright.sync_api import Browser, Page, Response
logger = logging.getLogger(__name__)
class PlaywrightEvaluator(ABC):
    """Interface that every page evaluator must implement.

    An evaluator is handed a page, a browser instance, and a response
    object; it processes the page however it needs to and hands back the
    resulting text. Both a blocking and a coroutine variant are required.
    """

    @abstractmethod
    def evaluate(self, page: "Page", browser: "Browser", response: "Response") -> str:
        """Process the page synchronously and produce its text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            The text content of the page.
        """

    @abstractmethod
    async def evaluate_async(
        self, page: "AsyncPage", browser: "AsyncBrowser", response: "AsyncResponse"
    ) -> str:
        """Process the page asynchronously and produce its text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            The text content of the page.
        """
class UnstructuredHtmlEvaluator(PlaywrightEvaluator):
    """Evaluates the page HTML content using the `unstructured` library.

    Before the HTML is partitioned, every visible element matching one of
    ``remove_selectors`` is removed from the page.
    """

    def __init__(self, remove_selectors: Optional[List[str]] = None):
        """Initialize UnstructuredHtmlEvaluator.

        Args:
            remove_selectors: Optional selectors (as accepted by
                ``page.locator``) whose visible matches are removed from the
                page before its content is read.

        Raises:
            ImportError: If the `unstructured` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as err:
            # Chain the original error so the real import failure stays
            # visible in the traceback.
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err

        self.remove_selectors = remove_selectors

    @staticmethod
    def _html_to_text(page_source: str) -> str:
        """Partition raw HTML with `unstructured` and join the elements."""
        from unstructured.partition.html import partition_html

        elements = partition_html(text=page_source)
        return "\n\n".join(str(el) for el in elements)

    def evaluate(self, page: "Page", browser: "Browser", response: "Response") -> str:
        """Synchronously process the HTML content of the page.

        Args:
            page: The page to process.
            browser: The browser instance (unused here).
            response: The response from page.goto() (unused here).

        Returns:
            The partitioned page elements joined by blank lines.
        """
        for selector in self.remove_selectors or []:
            # Locator.all() yields positional (nth) locators. Removing a match
            # shifts the positions of the matches after it, so walk backwards
            # to keep the not-yet-visited positions stable.
            for element in reversed(page.locator(selector).all()):
                if element.is_visible():
                    element.evaluate("element => element.remove()")

        return self._html_to_text(page.content())

    async def evaluate_async(
        self, page: "AsyncPage", browser: "AsyncBrowser", response: "AsyncResponse"
    ) -> str:
        """Asynchronously process the HTML content of the page.

        Args:
            page: The page to process.
            browser: The browser instance (unused here).
            response: The response from page.goto() (unused here).

        Returns:
            The partitioned page elements joined by blank lines.
        """
        for selector in self.remove_selectors or []:
            # Same reasoning as the sync path: remove from the last match to
            # the first so earlier positional locators stay valid.
            for element in reversed(await page.locator(selector).all()):
                if await element.is_visible():
                    await element.evaluate("element => element.remove()")

        return self._html_to_text(await page.content())
class PlaywrightURLLoader(BaseLoader):
"""Load `HTML` pages with `Playwright` and parse with `Unstructured`.
@ -26,8 +116,9 @@ class PlaywrightURLLoader(BaseLoader):
continue_on_failure: bool = True,
headless: bool = True,
remove_selectors: Optional[List[str]] = None,
evaluator: Optional[PlaywrightEvaluator] = None,
):
"""Load a list of URLs using Playwright and unstructured."""
"""Load a list of URLs using Playwright."""
try:
import playwright # noqa:F401
except ImportError:
@ -36,18 +127,17 @@ class PlaywrightURLLoader(BaseLoader):
"`pip install playwright`"
)
try:
import unstructured # noqa:F401
except ImportError:
raise ImportError(
"unstructured package not found, please install it with "
"`pip install unstructured`"
)
self.urls = urls
self.continue_on_failure = continue_on_failure
self.headless = headless
self.remove_selectors = remove_selectors
if remove_selectors and evaluator:
raise ValueError(
"`remove_selectors` and `evaluator` cannot be both not None"
)
# Use the provided evaluator, if any, otherwise, use the default.
self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors)
def load(self) -> List[Document]:
"""Load the specified URLs using Playwright and create Document instances.
@ -56,7 +146,6 @@ class PlaywrightURLLoader(BaseLoader):
List[Document]: A list of Document instances with loaded content.
"""
from playwright.sync_api import sync_playwright
from unstructured.partition.html import partition_html
docs: List[Document] = list()
@ -65,17 +154,8 @@ class PlaywrightURLLoader(BaseLoader):
for url in self.urls:
try:
page = browser.new_page()
page.goto(url)
for selector in self.remove_selectors or []:
elements = page.locator(selector).all()
for element in elements:
if element.is_visible():
element.evaluate("element => element.remove()")
page_source = page.content()
elements = partition_html(text=page_source)
text = "\n\n".join([str(el) for el in elements])
response = page.goto(url)
text = self.evaluator.evaluate(page, browser, response)
metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata))
except Exception as e:
@ -96,7 +176,6 @@ class PlaywrightURLLoader(BaseLoader):
List[Document]: A list of Document instances with loaded content.
"""
from playwright.async_api import async_playwright
from unstructured.partition.html import partition_html
docs: List[Document] = list()
@ -105,17 +184,8 @@ class PlaywrightURLLoader(BaseLoader):
for url in self.urls:
try:
page = await browser.new_page()
await page.goto(url)
for selector in self.remove_selectors or []:
elements = await page.locator(selector).all()
for element in elements:
if await element.is_visible():
await element.evaluate("element => element.remove()")
page_source = await page.content()
elements = partition_html(text=page_source)
text = "\n\n".join([str(el) for el in elements])
response = await page.goto(url)
text = await self.evaluator.evaluate_async(page, browser, response)
metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata))
except Exception as e:

View File

@ -1,7 +1,26 @@
"""Tests for the Playwright URL loader"""
from typing import TYPE_CHECKING
import pytest
from langchain.document_loaders import PlaywrightURLLoader
from langchain.document_loaders.url_playwright import PlaywrightEvaluator
if TYPE_CHECKING:
from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse
from playwright.sync_api import Browser, Page, Response
class TestEvaluator(PlaywrightEvaluator):
    """Stub evaluator for tests: ignores the page and yields a fixed string."""

    # Canned text returned by both evaluation paths.
    _FIXED_TEXT = "test"

    def evaluate(self, page: "Page", browser: "Browser", response: "Response") -> str:
        """Return the canned text without inspecting any argument."""
        return self._FIXED_TEXT

    async def evaluate_async(
        self, page: "AsyncPage", browser: "AsyncBrowser", response: "AsyncResponse"
    ) -> str:
        """Return the canned text without inspecting any argument."""
        return self._FIXED_TEXT
def test_playwright_url_loader() -> None:
@ -39,3 +58,32 @@ async def test_playwright_async_url_loader() -> None:
)
docs = await loader.aload()
assert len(docs) > 0
def test_playwright_url_loader_with_custom_evaluator() -> None:
    """Sync load with a custom evaluator yields that evaluator's text."""
    loader = PlaywrightURLLoader(
        urls=["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
        evaluator=TestEvaluator(),
        continue_on_failure=False,
        headless=True,
    )

    documents = loader.load()

    # One URL in, so exactly one document carrying the evaluator's output.
    assert [doc.page_content for doc in documents] == ["test"]
@pytest.mark.asyncio
async def test_playwright_async_url_loader_with_custom_evaluator() -> None:
    """Async load with a custom evaluator yields that evaluator's text."""
    loader = PlaywrightURLLoader(
        urls=["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
        evaluator=TestEvaluator(),
        continue_on_failure=False,
        headless=True,
    )

    documents = await loader.aload()

    # One URL in, so exactly one document carrying the evaluator's output.
    assert [doc.page_content for doc in documents] == ["test"]