From 0bff399af1d5bc2f33c79805e3222c1a448bc26a Mon Sep 17 00:00:00 2001
From: Todd Kerpelman <4397978+ToddKerpelman@users.noreply.github.com>
Date: Tue, 10 Oct 2023 20:32:45 -0700
Subject: [PATCH] Make metadata from the url_selenium loader match that of the
 web_base loader (#11617)

**Description:** I noticed the metadata returned by the url_selenium
loader was missing several values included by the web_base loader. (The
former returned `{source: ...}`, the latter returned `{source: ...,
title: ..., description: ..., language: ...}`.) This change fixes it so
both loaders return all four key-value pairs.

Files have been properly formatted and all tests are passing. Note,
however, that I am not much of a Python expert, so that whole "Adding
the imports inside the code so that tests pass" thing seems weird to me.
Please let me know if I did anything wrong.
---
 .../document_loaders/url_selenium.py          | 33 ++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/libs/langchain/langchain/document_loaders/url_selenium.py b/libs/langchain/langchain/document_loaders/url_selenium.py
index a4419c8b86..a82fa27176 100644
--- a/libs/langchain/langchain/document_loaders/url_selenium.py
+++ b/libs/langchain/langchain/document_loaders/url_selenium.py
@@ -115,6 +115,37 @@ class SeleniumURLLoader(BaseLoader):
         else:
             raise ValueError("Invalid browser specified. Use 'chrome' or 'firefox'.")
 
+    def _build_metadata(self, url: str, driver: Union["Chrome", "Firefox"]) -> dict:
+        """Build metadata from the page currently loaded in ``driver``.
+
+        Fields the page does not provide keep a "No ... found." placeholder.
+        """
+        from selenium.common.exceptions import NoSuchElementException
+        from selenium.webdriver.common.by import By
+
+        metadata = {
+            "source": url,
+            "title": driver.title or "No title found.",
+            "description": "No description found.",
+            "language": "No language found.",
+        }
+        # find_element raises NoSuchElementException when nothing matches.
+        try:
+            if meta := driver.find_element(By.XPATH, '//meta[@name="description"]'):
+                metadata["description"] = (
+                    meta.get_attribute("content") or "No description found."
+                )
+        except NoSuchElementException:
+            pass
+        try:
+            if html_tag := driver.find_element(By.TAG_NAME, "html"):
+                metadata["language"] = (
+                    html_tag.get_attribute("lang") or "No language found."
+                )
+        except NoSuchElementException:
+            pass
+        return metadata
+
     def load(self) -> List[Document]:
         """Load the specified URLs using Selenium and create Document instances.
 
@@ -132,7 +163,7 @@ class SeleniumURLLoader(BaseLoader):
                 page_content = driver.page_source
                 elements = partition_html(text=page_content)
                 text = "\n\n".join([str(el) for el in elements])
-                metadata = {"source": url}
+                metadata = self._build_metadata(url, driver)
                 docs.append(Document(page_content=text, metadata=metadata))
             except Exception as e:
                 if self.continue_on_failure: