mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
refactor(document_loaders): modify evaluation methods in PlaywrightURLLoader
This commit is contained in:
parent
dc4b037957
commit
224263aa24
@ -48,15 +48,19 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
self.headless = headless
|
self.headless = headless
|
||||||
self.remove_selectors = remove_selectors
|
self.remove_selectors = remove_selectors
|
||||||
|
|
||||||
def sync_evaluate_page(self, page):
|
def sync_evaluate(self, page, browser, response):
|
||||||
"""Process a page and return the text content synchronously.
|
"""Process a page and return the text content synchronously.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
page: The page to process.
|
page: The page to process.
|
||||||
|
browser: The browser instance.
|
||||||
|
response: The response from page.goto().
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
text: The text content of the page.
|
text: The text content of the page.
|
||||||
"""
|
"""
|
||||||
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
for selector in self.remove_selectors or []:
|
for selector in self.remove_selectors or []:
|
||||||
elements = page.locator(selector).all()
|
elements = page.locator(selector).all()
|
||||||
for element in elements:
|
for element in elements:
|
||||||
@ -68,15 +72,19 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
text = "\n\n".join([str(el) for el in elements])
|
text = "\n\n".join([str(el) for el in elements])
|
||||||
return text
|
return text
|
||||||
|
|
||||||
async def async_evaluate_page(self, page):
|
async def async_evaluate(self, page, browser, response):
|
||||||
"""Process a page and return the text content asynchronously.
|
"""Process a page and return the text content asynchronously.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
page: The page to process.
|
page: The page to process.
|
||||||
|
browser: The browser instance.
|
||||||
|
response: The response from page.goto().
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
text: The text content of the page.
|
text: The text content of the page.
|
||||||
"""
|
"""
|
||||||
|
from unstructured.partition.html import partition_html
|
||||||
|
|
||||||
for selector in self.remove_selectors or []:
|
for selector in self.remove_selectors or []:
|
||||||
elements = await page.locator(selector).all()
|
elements = await page.locator(selector).all()
|
||||||
for element in elements:
|
for element in elements:
|
||||||
@ -95,7 +103,6 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
List[Document]: A list of Document instances with loaded content.
|
List[Document]: A list of Document instances with loaded content.
|
||||||
"""
|
"""
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
from unstructured.partition.html import partition_html
|
|
||||||
|
|
||||||
docs: List[Document] = list()
|
docs: List[Document] = list()
|
||||||
|
|
||||||
@ -104,8 +111,8 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
try:
|
try:
|
||||||
page = browser.new_page()
|
page = browser.new_page()
|
||||||
page.goto(url)
|
response = page.goto(url)
|
||||||
text = self.sync_evaluate_page(page)
|
text = self.sync_evaluate(page, browser, response)
|
||||||
metadata = {"source": url}
|
metadata = {"source": url}
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
docs.append(Document(page_content=text, metadata=metadata))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -126,7 +133,6 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
List[Document]: A list of Document instances with loaded content.
|
List[Document]: A list of Document instances with loaded content.
|
||||||
"""
|
"""
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
from unstructured.partition.html import partition_html
|
|
||||||
|
|
||||||
docs: List[Document] = list()
|
docs: List[Document] = list()
|
||||||
|
|
||||||
@ -135,8 +141,8 @@ class PlaywrightURLLoader(BaseLoader):
|
|||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
try:
|
try:
|
||||||
page = await browser.new_page()
|
page = await browser.new_page()
|
||||||
await page.goto(url)
|
response = await page.goto(url)
|
||||||
text = await self.async_evaluate_page(page)
|
text = await self.async_evaluate(page, browser, response)
|
||||||
metadata = {"source": url}
|
metadata = {"source": url}
|
||||||
docs.append(Document(page_content=text, metadata=metadata))
|
docs.append(Document(page_content=text, metadata=metadata))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
Loading…
Reference in New Issue
Block a user