Introduces SeleniumURLLoader for JavaScript-Dependent Web Page Data Retrieval (#2291)

### Summary

This PR introduces a `SeleniumURLLoader` which, similar to `UnstructuredURLLoader`, loads data from URLs. However, it utilizes `selenium` to fetch page content, enabling it to work with JavaScript-rendered pages. The `unstructured` library is also employed for loading the HTML content.

### Testing

```bash
pip install selenium
pip install unstructured
```

```python
from langchain.document_loaders import SeleniumURLLoader

urls = [
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    "https://goo.gl/maps/NDSHwePEyaHMFGwh8"
]

loader = SeleniumURLLoader(urls=urls)
data = loader.load()
```
1 year ago · e4cfaa5680
parent 00d3ec5ed8
commit e4cfaa5680
3 changed files with 180 additions and 0 deletions
--- a/docs/modules/indexes/document_loaders/examples/url.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/url.ipynb
@ -52,6 +52,66 @@
   "source": [
    "data = loader.load()"
   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "f3afa135",
+   "metadata": {},
+   "source": [
+    "# Selenium URL Loader\n",
+    "\n",
+    "This covers how to load HTML documents from a list of URLs using the `SeleniumURLLoader`.\n",
+    "\n",
+    "Using selenium allows us to load pages that require JavaScript to render.\n",
+    "\n",
+    "## Setup\n",
+    "\n",
+    "To use the `SeleniumURLLoader`, you will need to install `selenium` and `unstructured`.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fc50835",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import SeleniumURLLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24e896ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "urls = [\n",
+    "    \"https://www.youtube.com/watch?v=dQw4w9WgXcQ\",\n",
+    "    \"https://goo.gl/maps/NDSHwePEyaHMFGwh8\"\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "60a29397",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = SeleniumURLLoader(urls=urls)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0090cd57",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
  }
 ],
 "metadata": {
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -58,6 +58,7 @@ from langchain.document_loaders.unstructured import (
    UnstructuredFileLoader,
 )
 from langchain.document_loaders.url import UnstructuredURLLoader
+from langchain.document_loaders.url_selenium import SeleniumURLLoader
 from langchain.document_loaders.web_base import WebBaseLoader
 from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
 from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
@ -74,6 +75,7 @@ __all__ = [
    "UnstructuredFileLoader",
    "UnstructuredFileIOLoader",
    "UnstructuredURLLoader",
+    "SeleniumURLLoader",
    "DirectoryLoader",
    "NotionDirectoryLoader",
    "NotionDBLoader",
--- a/langchain/document_loaders/url_selenium.py
+++ b/langchain/document_loaders/url_selenium.py
@ -0,0 +1,118 @@
+"""Loader that uses Selenium to load a page, then uses unstructured to load the html.
+"""
+import logging
+from typing import TYPE_CHECKING, List, Literal, Optional, Union
+
+if TYPE_CHECKING:
+    from selenium.webdriver import Chrome, Firefox
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+# Named after the module (not the file path) so the logger sits inside the
+# package's dotted logger hierarchy and inherits its configuration.
+logger = logging.getLogger(__name__)
+
+
+class SeleniumURLLoader(BaseLoader):
+    """Loader that uses Selenium to load a page and unstructured to parse the html.
+
+    This is useful for loading pages that require javascript to render.
+
+    Attributes:
+        urls (List[str]): List of URLs to load.
+        continue_on_failure (bool): If True, log the error and continue with the
+            remaining URLs when one fails; if False, re-raise the error.
+        browser (str): The browser to use, either 'chrome' or 'firefox'.
+        executable_path (Optional[str]): Forwarded as ``executable_path`` to the
+            Selenium WebDriver constructor; omitted from the call when None.
+        headless (bool): If True, the browser will run in headless mode.
+    """
+
+    def __init__(
+        self,
+        urls: List[str],
+        continue_on_failure: bool = True,
+        browser: Literal["chrome", "firefox"] = "chrome",
+        executable_path: Optional[str] = None,
+        headless: bool = True,
+    ):
+        """Load a list of URLs using Selenium and unstructured.
+
+        Raises:
+            ValueError: If the `selenium` or `unstructured` package is not
+                installed.
+        """
+        # Fail fast at construction time if an optional dependency is missing,
+        # rather than later inside `load()`.
+        try:
+            import selenium  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "selenium package not found, please install it with "
+                "`pip install selenium`"
+            )
+
+        try:
+            import unstructured  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "unstructured package not found, please install it with "
+                "`pip install unstructured`"
+            )
+
+        self.urls = urls
+        self.continue_on_failure = continue_on_failure
+        self.browser = browser
+        self.executable_path = executable_path
+        self.headless = headless
+
+    def _get_driver(self) -> Union["Chrome", "Firefox"]:
+        """Create and return a WebDriver instance based on the specified browser.
+
+        The browser name is matched case-insensitively.
+
+        Raises:
+            ValueError: If an invalid browser is specified.
+
+        Returns:
+            Union[Chrome, Firefox]: A WebDriver instance for the specified browser.
+        """
+        # NOTE(review): newer Selenium releases are reported to replace the
+        # `executable_path` keyword with a Service object - confirm against the
+        # Selenium version this project supports.
+        browser = self.browser.lower()
+        if browser == "chrome":
+            from selenium.webdriver import Chrome
+            from selenium.webdriver.chrome.options import Options as ChromeOptions
+
+            chrome_options = ChromeOptions()
+            if self.headless:
+                chrome_options.add_argument("--headless")
+            if self.executable_path is None:
+                return Chrome(options=chrome_options)
+            return Chrome(executable_path=self.executable_path, options=chrome_options)
+        elif browser == "firefox":
+            from selenium.webdriver import Firefox
+            from selenium.webdriver.firefox.options import Options as FirefoxOptions
+
+            firefox_options = FirefoxOptions()
+            if self.headless:
+                firefox_options.add_argument("--headless")
+            if self.executable_path is None:
+                return Firefox(options=firefox_options)
+            return Firefox(
+                executable_path=self.executable_path, options=firefox_options
+            )
+        else:
+            raise ValueError("Invalid browser specified. Use 'chrome' or 'firefox'.")
+
+    def load(self) -> List[Document]:
+        """Load the specified URLs using Selenium and create Document instances.
+
+        A single WebDriver is created for the whole call and is always shut down
+        before returning or propagating an error.
+
+        Raises:
+            Exception: Whatever fetching or parsing a URL raised, when
+                `continue_on_failure` is False.
+
+        Returns:
+            List[Document]: One Document per successfully loaded URL, with the
+                URL stored under the "source" metadata key.
+        """
+        from unstructured.partition.html import partition_html
+
+        docs: List[Document] = []
+        driver = self._get_driver()
+
+        # The `finally` guarantees the browser process is closed even when an
+        # error is re-raised below; otherwise it would be leaked.
+        try:
+            for url in self.urls:
+                try:
+                    driver.get(url)
+                    page_content = driver.page_source
+                    elements = partition_html(text=page_content)
+                    text = "\n\n".join(str(el) for el in elements)
+                    metadata = {"source": url}
+                    docs.append(Document(page_content=text, metadata=metadata))
+                except Exception as e:
+                    if self.continue_on_failure:
+                        logger.error(
+                            f"Error fetching or processing {url}, exception: {e}"
+                        )
+                    else:
+                        # Bare raise keeps the original traceback intact.
+                        raise
+        finally:
+            driver.quit()
+
+        return docs