From fea5619ce9f7f9170c7d3c126d4ac34742aeffe8 Mon Sep 17 00:00:00 2001
From: Tim Asp <707699+timothyasp@users.noreply.github.com>
Date: Sat, 15 Apr 2023 16:07:08 -0700
Subject: [PATCH] Add title, lang, description to Web loader document metadata
 (#2955)

Title, lang, and description are present on almost every web page, and they
are incredibly useful pieces of information that aren't currently captured by
the web base loader.

I thought about adding the title and description to the content of the
document, since that content could be useful in search, but I left it out for
now. If you think it'd be worth adding, I'm happy to add it.

I've found it's nice to have the title/description in the metadata, so there
is some structured data available when retrieving rows from vector DBs for
summaries and source citations. So if we do want to add them to the
`page_content`, I'd advocate for also including them in the metadata.
---
 langchain/document_loaders/web_base.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py
index f82e6a92..d51f458b 100644
--- a/langchain/document_loaders/web_base.py
+++ b/langchain/document_loaders/web_base.py
@@ -23,6 +23,18 @@ default_header_template = {
 }
 
 
+def _build_metadata(soup: Any, url: str) -> dict:
+    """Build metadata from BeautifulSoup output."""
+    metadata = {"source": url}
+    if title := soup.find("title"):
+        metadata["title"] = title.get_text()
+    if description := soup.find("meta", attrs={"name": "description"}):
+        metadata["description"] = description.get("content", None)
+    if html := soup.find("html"):
+        metadata["language"] = html.get("lang", None)
+    return metadata
+
+
 class WebBaseLoader(BaseLoader):
     """Loader that uses urllib and beautiful soup to load webpages."""
 
@@ -148,7 +160,7 @@ class WebBaseLoader(BaseLoader):
         for path in self.web_paths:
             soup = self._scrape(path)
             text = soup.get_text()
-            metadata = {"source": path}
+            metadata = _build_metadata(soup, path)
             docs.append(Document(page_content=text, metadata=metadata))
         return docs
 
@@ -159,8 +171,9 @@ class WebBaseLoader(BaseLoader):
         results = self.scrape_all(self.web_paths)
         docs = []
         for i in range(len(results)):
-            text = results[i].get_text()
-            metadata = {"source": self.web_paths[i]}
+            soup = results[i]
+            text = soup.get_text()
+            metadata = _build_metadata(soup, self.web_paths[i])
             docs.append(Document(page_content=text, metadata=metadata))
         return docs
 
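
For reference, a minimal usage sketch (not part of the patch) showing how the enriched metadata could be inspected once this change is applied. The URL is illustrative, and which of the new keys actually appear depends on the tags present on the fetched page:

```python
from langchain.document_loaders import WebBaseLoader

# Illustrative URL; any page that declares <title>, <meta name="description">,
# or a lang attribute on <html> will populate the corresponding metadata keys.
loader = WebBaseLoader("https://python.langchain.com/")
docs = loader.load()

doc = docs[0]
# Before this patch, metadata only contained {"source": <url>}.
# With the patch applied, it may also include "title", "description", and
# "language", depending on which tags the page actually provides.
print(doc.metadata)
```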