From e4cfaa5680c41c0ddf8a5dd7b73681fecf4c9669 Mon Sep 17 00:00:00 2001 From: Kevin Huang <44907675+Ynng@users.noreply.github.com> Date: Sun, 2 Apr 2023 17:05:00 -0400 Subject: [PATCH] Introduces SeleniumURLLoader for JavaScript-Dependent Web Page Data Retrieval (#2291) ### Summary This PR introduces a `SeleniumURLLoader` which, similar to `UnstructuredURLLoader`, loads data from URLs. However, it utilizes `selenium` to fetch page content, enabling it to work with JavaScript-rendered pages. The `unstructured` library is also employed for loading the HTML content. ### Testing ```bash pip install selenium pip install unstructured ``` ```python from langchain.document_loaders import SeleniumURLLoader urls = [ "https://www.youtube.com/watch?v=dQw4w9WgXcQ", "https://goo.gl/maps/NDSHwePEyaHMFGwh8" ] loader = SeleniumURLLoader(urls=urls) data = loader.load() ``` --- .../document_loaders/examples/url.ipynb | 60 +++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/url_selenium.py | 118 ++++++++++++++++++ 3 files changed, 180 insertions(+) create mode 100644 langchain/document_loaders/url_selenium.py diff --git a/docs/modules/indexes/document_loaders/examples/url.ipynb b/docs/modules/indexes/document_loaders/examples/url.ipynb index c24c9119..581f1a33 100644 --- a/docs/modules/indexes/document_loaders/examples/url.ipynb +++ b/docs/modules/indexes/document_loaders/examples/url.ipynb @@ -52,6 +52,66 @@ "source": [ "data = loader.load()" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f3afa135", + "metadata": {}, + "source": [ + "# Selenium URL Loader\n", + "\n", + "This covers how to load HTML documents from a list of URLs using the `SeleniumURLLoader`.\n", + "\n", + "Using selenium allows us to load pages that require JavaScript to render.\n", + "\n", + "## Setup\n", + "\n", + "To use the `SeleniumURLLoader`, you will need to install `selenium` and `unstructured`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fc50835", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import SeleniumURLLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24e896ce", + "metadata": {}, + "outputs": [], + "source": [ + "urls = [\n", + " \"https://www.youtube.com/watch?v=dQw4w9WgXcQ\",\n", + " \"https://goo.gl/maps/NDSHwePEyaHMFGwh8\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60a29397", + "metadata": {}, + "outputs": [], + "source": [ + "loader = SeleniumURLLoader(urls=urls)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0090cd57", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] } ], "metadata": { diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 746a94be..31cdfed0 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -58,6 +58,7 @@ from langchain.document_loaders.unstructured import ( UnstructuredFileLoader, ) from langchain.document_loaders.url import UnstructuredURLLoader +from langchain.document_loaders.url_selenium import SeleniumURLLoader from langchain.document_loaders.web_base import WebBaseLoader from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader @@ -74,6 +75,7 @@ __all__ = [ "UnstructuredFileLoader", "UnstructuredFileIOLoader", "UnstructuredURLLoader", + "SeleniumURLLoader", "DirectoryLoader", "NotionDirectoryLoader", "NotionDBLoader", diff --git a/langchain/document_loaders/url_selenium.py b/langchain/document_loaders/url_selenium.py new file mode 100644 index 00000000..dd0c7152 --- /dev/null +++ b/langchain/document_loaders/url_selenium.py @@ -0,0 +1,118 @@ +"""Loader that uses Selenium to load a page, then uses unstructured to load the html. +""" +import logging +from typing import TYPE_CHECKING, List, Literal, Optional, Union + +if TYPE_CHECKING: + from selenium.webdriver import Chrome, Firefox + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + +logger = logging.getLogger(__file__) + + +class SeleniumURLLoader(BaseLoader): + """Loader that uses Selenium and to load a page and unstructured to load the html. + This is useful for loading pages that require javascript to render. + + Attributes: + urls (List[str]): List of URLs to load. + continue_on_failure (bool): If True, continue loading other URLs on failure. + browser (str): The browser to use, either 'chrome' or 'firefox'. + executable_path (Optional[str]): The path to the browser executable. + headless (bool): If True, the browser will run in headless mode. + """ + + def __init__( + self, + urls: List[str], + continue_on_failure: bool = True, + browser: Literal["chrome", "firefox"] = "chrome", + executable_path: Optional[str] = None, + headless: bool = True, + ): + """Load a list of URLs using Selenium and unstructured.""" + try: + import selenium # noqa:F401 + except ImportError: + raise ValueError( + "selenium package not found, please install it with " + "`pip install selenium`" + ) + + try: + import unstructured # noqa:F401 + except ImportError: + raise ValueError( + "unstructured package not found, please install it with " + "`pip install unstructured`" + ) + + self.urls = urls + self.continue_on_failure = continue_on_failure + self.browser = browser + self.executable_path = executable_path + self.headless = headless + + def _get_driver(self) -> Union["Chrome", "Firefox"]: + """Create and return a WebDriver instance based on the specified browser. + + Raises: + ValueError: If an invalid browser is specified. + + Returns: + Union[Chrome, Firefox]: A WebDriver instance for the specified browser. + """ + if self.browser.lower() == "chrome": + from selenium.webdriver import Chrome + from selenium.webdriver.chrome.options import Options as ChromeOptions + + chrome_options = ChromeOptions() + if self.headless: + chrome_options.add_argument("--headless") + if self.executable_path is None: + return Chrome(options=chrome_options) + return Chrome(executable_path=self.executable_path, options=chrome_options) + elif self.browser.lower() == "firefox": + from selenium.webdriver import Firefox + from selenium.webdriver.firefox.options import Options as FirefoxOptions + + firefox_options = FirefoxOptions() + if self.headless: + firefox_options.add_argument("--headless") + if self.executable_path is None: + return Firefox(options=firefox_options) + return Firefox( + executable_path=self.executable_path, options=firefox_options + ) + else: + raise ValueError("Invalid browser specified. Use 'chrome' or 'firefox'.") + + def load(self) -> List[Document]: + """Load the specified URLs using Selenium and create Document instances. + + Returns: + List[Document]: A list of Document instances with loaded content. + """ + from unstructured.partition.html import partition_html + + docs: List[Document] = list() + driver = self._get_driver() + + for url in self.urls: + try: + driver.get(url) + page_content = driver.page_source + elements = partition_html(text=page_content) + text = "\n\n".join([str(el) for el in elements]) + metadata = {"source": url} + docs.append(Document(page_content=text, metadata=metadata)) + except Exception as e: + if self.continue_on_failure: + logger.error(f"Error fetching or processing {url}, exception: {e}") + else: + raise e + + driver.quit() + return docs