From e4cfaa5680c41c0ddf8a5dd7b73681fecf4c9669 Mon Sep 17 00:00:00 2001
From: Kevin Huang <44907675+Ynng@users.noreply.github.com>
Date: Sun, 2 Apr 2023 17:05:00 -0400
Subject: [PATCH] Introduces SeleniumURLLoader for JavaScript-Dependent Web
 Page Data Retrieval (#2291)

### Summary
This PR introduces a `SeleniumURLLoader` which, similar to
`UnstructuredURLLoader`, loads data from URLs. However, it utilizes
`selenium` to fetch page content, enabling it to work with
JavaScript-rendered pages. The `unstructured` library is also employed
for loading the HTML content.

### Testing
```bash
pip install selenium
pip install unstructured
```

```python
from langchain.document_loaders import SeleniumURLLoader

urls = [
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
    "https://goo.gl/maps/NDSHwePEyaHMFGwh8"
]

loader = SeleniumURLLoader(urls=urls)
data = loader.load()
```
---
 .../document_loaders/examples/url.ipynb       |  60 +++++++++
 langchain/document_loaders/__init__.py        |   2 +
 langchain/document_loaders/url_selenium.py    | 118 ++++++++++++++++++
 3 files changed, 180 insertions(+)
 create mode 100644 langchain/document_loaders/url_selenium.py

diff --git a/docs/modules/indexes/document_loaders/examples/url.ipynb b/docs/modules/indexes/document_loaders/examples/url.ipynb
index c24c9119..581f1a33 100644
--- a/docs/modules/indexes/document_loaders/examples/url.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/url.ipynb
@@ -52,6 +52,66 @@
    "source": [
     "data = loader.load()"
    ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "f3afa135",
+   "metadata": {},
+   "source": [
+    "# Selenium URL Loader\n",
+    "\n",
+    "This covers how to load HTML documents from a list of URLs using the `SeleniumURLLoader`.\n",
+    "\n",
+    "Using selenium allows us to load pages that require JavaScript to render.\n",
+    "\n",
+    "## Setup\n",
+    "\n",
+    "To use the `SeleniumURLLoader`, you will need to install `selenium` and `unstructured`.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fc50835",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import SeleniumURLLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24e896ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "urls = [\n",
+    "    \"https://www.youtube.com/watch?v=dQw4w9WgXcQ\",\n",
+    "    \"https://goo.gl/maps/NDSHwePEyaHMFGwh8\"\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "60a29397",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = SeleniumURLLoader(urls=urls)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0090cd57",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
   }
  ],
  "metadata": {
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index 746a94be..31cdfed0 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -58,6 +58,7 @@ from langchain.document_loaders.unstructured import (
     UnstructuredFileLoader,
 )
 from langchain.document_loaders.url import UnstructuredURLLoader
+from langchain.document_loaders.url_selenium import SeleniumURLLoader
 from langchain.document_loaders.web_base import WebBaseLoader
 from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
 from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
@@ -74,6 +75,7 @@ __all__ = [
     "UnstructuredFileLoader",
     "UnstructuredFileIOLoader",
     "UnstructuredURLLoader",
+    "SeleniumURLLoader",
     "DirectoryLoader",
     "NotionDirectoryLoader",
     "NotionDBLoader",
diff --git a/langchain/document_loaders/url_selenium.py b/langchain/document_loaders/url_selenium.py
new file mode 100644
index 00000000..dd0c7152
--- /dev/null
+++ b/langchain/document_loaders/url_selenium.py
@@ -0,0 +1,118 @@
+"""Loader that uses Selenium to load a page, then uses unstructured to load the html.
+"""
+import logging
+from typing import TYPE_CHECKING, List, Literal, Optional, Union
+
+if TYPE_CHECKING:
+    from selenium.webdriver import Chrome, Firefox
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+logger = logging.getLogger(__name__)  # dotted name joins the package logger tree
+
+
+class SeleniumURLLoader(BaseLoader):
+    """Loader that uses Selenium to load a page and unstructured to parse the html.
+    This is useful for loading pages that require javascript to render.
+
+    Attributes:
+        urls (List[str]): List of URLs to load.
+        continue_on_failure (bool): If True, continue loading other URLs on failure.
+        browser (str): The browser to use, either 'chrome' or 'firefox'.
+        executable_path (Optional[str]): The path to the browser executable.
+        headless (bool): If True, the browser will run in headless mode.
+    """
+
+    def __init__(
+        self,
+        urls: List[str],
+        continue_on_failure: bool = True,
+        browser: Literal["chrome", "firefox"] = "chrome",
+        executable_path: Optional[str] = None,
+        headless: bool = True,
+    ):
+        """Load a list of URLs using Selenium and unstructured."""
+        try:
+            import selenium  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "selenium package not found, please install it with "
+                "`pip install selenium`"
+            )
+
+        try:
+            import unstructured  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "unstructured package not found, please install it with "
+                "`pip install unstructured`"
+            )
+
+        self.urls = urls
+        self.continue_on_failure = continue_on_failure
+        self.browser = browser
+        self.executable_path = executable_path
+        self.headless = headless
+
+    def _get_driver(self) -> Union["Chrome", "Firefox"]:
+        """Create and return a WebDriver instance based on the specified browser.
+
+        Raises:
+            ValueError: If an invalid browser is specified.
+
+        Returns:
+            Union[Chrome, Firefox]: A WebDriver instance for the specified browser.
+        """
+        if self.browser.lower() == "chrome":
+            from selenium.webdriver import Chrome
+            from selenium.webdriver.chrome.options import Options as ChromeOptions
+
+            chrome_options = ChromeOptions()
+            if self.headless:
+                chrome_options.add_argument("--headless")
+            if self.executable_path is None:
+                return Chrome(options=chrome_options)
+            return Chrome(executable_path=self.executable_path, options=chrome_options)
+        elif self.browser.lower() == "firefox":
+            from selenium.webdriver import Firefox
+            from selenium.webdriver.firefox.options import Options as FirefoxOptions
+
+            firefox_options = FirefoxOptions()
+            if self.headless:
+                firefox_options.add_argument("--headless")
+            if self.executable_path is None:
+                return Firefox(options=firefox_options)
+            return Firefox(
+                executable_path=self.executable_path, options=firefox_options
+            )
+        else:
+            raise ValueError("Invalid browser specified. Use 'chrome' or 'firefox'.")
+
+    def load(self) -> List[Document]:
+        """Load the specified URLs using Selenium and create Document instances.
+
+        Returns:
+            List[Document]: A list of Document instances with loaded content.
+
+        Raises:
+            Exception: Any per-URL error, when continue_on_failure is False.
+        """
+        from unstructured.partition.html import partition_html
+        docs: List[Document] = list()
+        driver = self._get_driver()
+        try:
+            for url in self.urls:
+                try:
+                    driver.get(url)
+                    elements = partition_html(text=driver.page_source)
+                    text = "\n\n".join([str(el) for el in elements])
+                    docs.append(Document(page_content=text, metadata={"source": url}))
+                except Exception as e:
+                    if not self.continue_on_failure:
+                        raise
+                    logger.error(f"Error fetching or processing {url}, exception: {e}")
+        finally:
+            # Quit even if an error is re-raised so the browser is never leaked.
+            driver.quit()
+        return docs