mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
refactor(document_loaders): abstract page evaluation logic in PlaywrightURLLoader (#9995)
This PR brings structural updates to `PlaywrightURLLoader`, aiming to make the code more readable and extensible by abstracting the page evaluation logic. These changes also align this implementation with a similar structure used in LangChain.js. The key enhancements include: 1. Introduction of 'PlaywrightEvaluator', an abstract base class for all evaluators. 2. Creation of 'UnstructuredHtmlEvaluator', a concrete class implementing 'PlaywrightEvaluator', which uses the `unstructured` library to process the page's HTML content. 3. Extension of the 'PlaywrightURLLoader' constructor to optionally accept an evaluator of type 'PlaywrightEvaluator'. It defaults to 'UnstructuredHtmlEvaluator' if no evaluator is provided. 4. Refactoring of the 'load' and 'aload' methods to use the 'evaluate' and 'evaluate_async' methods of the provided 'PlaywrightEvaluator' for page content handling. This update brings flexibility to 'PlaywrightURLLoader', as it can now use different evaluators for page processing depending on requirements. The abstraction also improves code maintainability and readability. Twitter: @ywkim
This commit is contained in:
commit
6b5a970949
@ -1,14 +1,104 @@
|
||||
"""Loader that uses Playwright to load a page, then uses unstructured to load the html.
|
||||
"""
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse
|
||||
from playwright.sync_api import Browser, Page, Response
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PlaywrightEvaluator(ABC):
    """Interface that every page evaluator must implement.

    An evaluator receives a page, the browser that owns it, and the response
    object, and turns the page into a single string of text. Both a
    synchronous and an asynchronous entry point must be provided by
    subclasses.
    """

    @abstractmethod
    def evaluate(
        self,
        page: "Page",
        browser: "Browser",
        response: "Response",
    ) -> str:
        """Turn a page into text using the synchronous Playwright API.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            The text content of the page.
        """

    @abstractmethod
    async def evaluate_async(
        self,
        page: "AsyncPage",
        browser: "AsyncBrowser",
        response: "AsyncResponse",
    ) -> str:
        """Turn a page into text using the asynchronous Playwright API.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            The text content of the page.
        """
|
||||
|
||||
|
||||
class UnstructuredHtmlEvaluator(PlaywrightEvaluator):
    """Evaluates the page HTML content using the `unstructured` library.

    Before the HTML is read, every visible element matching one of
    ``remove_selectors`` is removed from the page. The remaining HTML is
    partitioned with ``unstructured`` and the string form of each resulting
    element is joined with blank lines.
    """

    def __init__(self, remove_selectors: Optional[List[str]] = None):
        """Initialize UnstructuredHtmlEvaluator.

        Args:
            remove_selectors: Optional selectors handed to ``page.locator``;
                visible matches are removed from the page before its HTML
                is read. ``None`` means nothing is removed.

        Raises:
            ImportError: If the `unstructured` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as e:
            # Chain explicitly so the underlying import failure stays visible.
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from e

        self.remove_selectors = remove_selectors

    @staticmethod
    def _html_to_text(page_source: str) -> str:
        """Partition raw HTML with `unstructured` and join the pieces as text."""
        # Imported lazily so the module itself does not require `unstructured`.
        from unstructured.partition.html import partition_html

        return "\n\n".join(str(el) for el in partition_html(text=page_source))

    def evaluate(self, page: "Page", browser: "Browser", response: "Response") -> str:
        """Synchronously process the HTML content of the page.

        Args:
            page: The page to process.
            browser: The browser instance (unused by this evaluator).
            response: The response from page.goto() (unused by this evaluator).

        Returns:
            The text extracted from the page's HTML.
        """
        for selector in self.remove_selectors or []:
            for element in page.locator(selector).all():
                # Only visible matches are removed; hidden ones are left alone.
                if element.is_visible():
                    element.evaluate("element => element.remove()")

        return self._html_to_text(page.content())

    async def evaluate_async(
        self, page: "AsyncPage", browser: "AsyncBrowser", response: "AsyncResponse"
    ) -> str:
        """Asynchronously process the HTML content of the page.

        Args:
            page: The page to process.
            browser: The browser instance (unused by this evaluator).
            response: The response from page.goto() (unused by this evaluator).

        Returns:
            The text extracted from the page's HTML.
        """
        for selector in self.remove_selectors or []:
            for element in await page.locator(selector).all():
                # Only visible matches are removed; hidden ones are left alone.
                if await element.is_visible():
                    await element.evaluate("element => element.remove()")

        return self._html_to_text(await page.content())
|
||||
|
||||
|
||||
class PlaywrightURLLoader(BaseLoader):
|
||||
"""Load `HTML` pages with `Playwright` and parse with `Unstructured`.
|
||||
|
||||
@ -26,8 +116,9 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
continue_on_failure: bool = True,
|
||||
headless: bool = True,
|
||||
remove_selectors: Optional[List[str]] = None,
|
||||
evaluator: Optional[PlaywrightEvaluator] = None,
|
||||
):
|
||||
"""Load a list of URLs using Playwright and unstructured."""
|
||||
"""Load a list of URLs using Playwright."""
|
||||
try:
|
||||
import playwright # noqa:F401
|
||||
except ImportError:
|
||||
@ -36,18 +127,17 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
"`pip install playwright`"
|
||||
)
|
||||
|
||||
try:
|
||||
import unstructured # noqa:F401
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"unstructured package not found, please install it with "
|
||||
"`pip install unstructured`"
|
||||
)
|
||||
|
||||
self.urls = urls
|
||||
self.continue_on_failure = continue_on_failure
|
||||
self.headless = headless
|
||||
self.remove_selectors = remove_selectors
|
||||
|
||||
if remove_selectors and evaluator:
|
||||
raise ValueError(
|
||||
"`remove_selectors` and `evaluator` cannot be both not None"
|
||||
)
|
||||
|
||||
# Use the provided evaluator, if any, otherwise, use the default.
|
||||
self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load the specified URLs using Playwright and create Document instances.
|
||||
@ -56,7 +146,6 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
List[Document]: A list of Document instances with loaded content.
|
||||
"""
|
||||
from playwright.sync_api import sync_playwright
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
docs: List[Document] = list()
|
||||
|
||||
@ -65,17 +154,8 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
for url in self.urls:
|
||||
try:
|
||||
page = browser.new_page()
|
||||
page.goto(url)
|
||||
|
||||
for selector in self.remove_selectors or []:
|
||||
elements = page.locator(selector).all()
|
||||
for element in elements:
|
||||
if element.is_visible():
|
||||
element.evaluate("element => element.remove()")
|
||||
|
||||
page_source = page.content()
|
||||
elements = partition_html(text=page_source)
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
response = page.goto(url)
|
||||
text = self.evaluator.evaluate(page, browser, response)
|
||||
metadata = {"source": url}
|
||||
docs.append(Document(page_content=text, metadata=metadata))
|
||||
except Exception as e:
|
||||
@ -96,7 +176,6 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
List[Document]: A list of Document instances with loaded content.
|
||||
"""
|
||||
from playwright.async_api import async_playwright
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
docs: List[Document] = list()
|
||||
|
||||
@ -105,17 +184,8 @@ class PlaywrightURLLoader(BaseLoader):
|
||||
for url in self.urls:
|
||||
try:
|
||||
page = await browser.new_page()
|
||||
await page.goto(url)
|
||||
|
||||
for selector in self.remove_selectors or []:
|
||||
elements = await page.locator(selector).all()
|
||||
for element in elements:
|
||||
if await element.is_visible():
|
||||
await element.evaluate("element => element.remove()")
|
||||
|
||||
page_source = await page.content()
|
||||
elements = partition_html(text=page_source)
|
||||
text = "\n\n".join([str(el) for el in elements])
|
||||
response = await page.goto(url)
|
||||
text = await self.evaluator.evaluate_async(page, browser, response)
|
||||
metadata = {"source": url}
|
||||
docs.append(Document(page_content=text, metadata=metadata))
|
||||
except Exception as e:
|
||||
|
@ -1,7 +1,26 @@
|
||||
"""Tests for the Playwright URL loader"""
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders import PlaywrightURLLoader
|
||||
from langchain.document_loaders.url_playwright import PlaywrightEvaluator
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse
|
||||
from playwright.sync_api import Browser, Page, Response
|
||||
|
||||
|
||||
class TestEvaluator(PlaywrightEvaluator):
    """A simple evaluator for testing purposes.

    Both entry points ignore the page, browser and response they are given
    and return the constant string ``"test"``, so tests can check that the
    loader uses the evaluator's output as the document content.
    """

    # The `Test` prefix matches pytest's default class-collection pattern;
    # mark this helper explicitly so it is never treated as a test class.
    __test__ = False

    def evaluate(self, page: "Page", browser: "Browser", response: "Response") -> str:
        """Return the fixed marker string without inspecting the page."""
        return "test"

    async def evaluate_async(
        self, page: "AsyncPage", browser: "AsyncBrowser", response: "AsyncResponse"
    ) -> str:
        """Return the fixed marker string without inspecting the page."""
        return "test"
|
||||
|
||||
|
||||
def test_playwright_url_loader() -> None:
|
||||
@ -39,3 +58,32 @@ async def test_playwright_async_url_loader() -> None:
|
||||
)
|
||||
docs = await loader.aload()
|
||||
assert len(docs) > 0
|
||||
|
||||
|
||||
def test_playwright_url_loader_with_custom_evaluator() -> None:
    """Check that the sync loader returns the custom evaluator's output."""
    loader = PlaywrightURLLoader(
        urls=["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
        evaluator=TestEvaluator(),
        continue_on_failure=False,
        headless=True,
    )

    # One URL in, so exactly one document holding the evaluator's text.
    contents = [doc.page_content for doc in loader.load()]
    assert len(contents) == 1
    assert contents[0] == "test"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_playwright_async_url_loader_with_custom_evaluator() -> None:
    """Check that the async loader returns the custom evaluator's output."""
    loader = PlaywrightURLLoader(
        urls=["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
        evaluator=TestEvaluator(),
        continue_on_failure=False,
        headless=True,
    )

    # One URL in, so exactly one document holding the evaluator's text.
    contents = [doc.page_content for doc in await loader.aload()]
    assert len(contents) == 1
    assert contents[0] == "test"
|
||||
|
Loading…
Reference in New Issue
Block a user