refactor(document_loaders): modify evaluation methods in PlaywrightURLLoader

This commit is contained in:
Youngwook Kim 2023-08-09 10:51:34 +09:00
parent dc4b037957
commit 224263aa24

View File

@@ -48,15 +48,19 @@ class PlaywrightURLLoader(BaseLoader):
self.headless = headless self.headless = headless
self.remove_selectors = remove_selectors self.remove_selectors = remove_selectors
def sync_evaluate_page(self, page): def sync_evaluate(self, page, browser, response):
"""Process a page and return the text content synchronously. """Process a page and return the text content synchronously.
Args: Args:
page: The page to process. page: The page to process.
browser: The browser instance.
response: The response from page.goto().
Returns: Returns:
text: The text content of the page. text: The text content of the page.
""" """
from unstructured.partition.html import partition_html
for selector in self.remove_selectors or []: for selector in self.remove_selectors or []:
elements = page.locator(selector).all() elements = page.locator(selector).all()
for element in elements: for element in elements:
@@ -68,15 +72,19 @@ class PlaywrightURLLoader(BaseLoader):
text = "\n\n".join([str(el) for el in elements]) text = "\n\n".join([str(el) for el in elements])
return text return text
async def async_evaluate_page(self, page): async def async_evaluate(self, page, browser, response):
"""Process a page and return the text content asynchronously. """Process a page and return the text content asynchronously.
Args: Args:
page: The page to process. page: The page to process.
browser: The browser instance.
response: The response from page.goto().
Returns: Returns:
text: The text content of the page. text: The text content of the page.
""" """
from unstructured.partition.html import partition_html
for selector in self.remove_selectors or []: for selector in self.remove_selectors or []:
elements = await page.locator(selector).all() elements = await page.locator(selector).all()
for element in elements: for element in elements:
@@ -95,7 +103,6 @@ class PlaywrightURLLoader(BaseLoader):
List[Document]: A list of Document instances with loaded content. List[Document]: A list of Document instances with loaded content.
""" """
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
from unstructured.partition.html import partition_html
docs: List[Document] = list() docs: List[Document] = list()
@@ -104,8 +111,8 @@ class PlaywrightURLLoader(BaseLoader):
for url in self.urls: for url in self.urls:
try: try:
page = browser.new_page() page = browser.new_page()
page.goto(url) response = page.goto(url)
text = self.sync_evaluate_page(page) text = self.sync_evaluate(page, browser, response)
metadata = {"source": url} metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata)) docs.append(Document(page_content=text, metadata=metadata))
except Exception as e: except Exception as e:
@@ -126,7 +133,6 @@ class PlaywrightURLLoader(BaseLoader):
List[Document]: A list of Document instances with loaded content. List[Document]: A list of Document instances with loaded content.
""" """
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
from unstructured.partition.html import partition_html
docs: List[Document] = list() docs: List[Document] = list()
@@ -135,8 +141,8 @@ class PlaywrightURLLoader(BaseLoader):
for url in self.urls: for url in self.urls:
try: try:
page = await browser.new_page() page = await browser.new_page()
await page.goto(url) response = await page.goto(url)
text = await self.async_evaluate_page(page) text = await self.async_evaluate(page, browser, response)
metadata = {"source": url} metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata)) docs.append(Document(page_content=text, metadata=metadata))
except Exception as e: except Exception as e: