Add title, lang, description to Web loader document metadata (#2955)

Title, lang and description are on almost every web page, and are incredibly useful pieces of information that currently aren't captured by the web base loader. I thought about also adding the title and description to the content of the document, as that content could be useful in search, but I left it out for now. If you think it'd be worth adding, I'm happy to add it. I've found it's nice to have the title/description in the metadata so there is some structured data when retrieving rows from vector DBs for use with summaries and source citations, so if we do want to add it to the `page_content`, I'd advocate for it to also be included in the metadata.
1 year ago · fea5619ce9
parent f7bf917baf
commit fea5619ce9
1 changed files with 16 additions and 3 deletions
--- a/langchain/document_loaders/web_base.py
+++ b/langchain/document_loaders/web_base.py
@@ -23,6 +23,18 @@ default_header_template = {
 }


+def _build_metadata(soup: Any, url: str) -> dict:
+    """Build metadata from BeautifulSoup output."""
+    metadata = {"source": url}
+    if title := soup.find("title"):
+        metadata["title"] = title.get_text()
+    if description := soup.find("meta", attrs={"name": "description"}):
+        metadata["description"] = description.get("content", None)
+    if html := soup.find("html"):
+        metadata["language"] = html.get("lang", None)
+    return metadata
+
+
 class WebBaseLoader(BaseLoader):
    """Loader that uses urllib and beautiful soup to load webpages."""

@@ -148,7 +160,7 @@ class WebBaseLoader(BaseLoader):
        for path in self.web_paths:
            soup = self._scrape(path)
            text = soup.get_text()
-            metadata = {"source": path}
+            metadata = _build_metadata(soup, path)
            docs.append(Document(page_content=text, metadata=metadata))

        return docs
@@ -159,8 +171,9 @@ class WebBaseLoader(BaseLoader):
        results = self.scrape_all(self.web_paths)
        docs = []
        for i in range(len(results)):
-            text = results[i].get_text()
-            metadata = {"source": self.web_paths[i]}
+            soup = results[i]
+            text = soup.get_text()
+            metadata = _build_metadata(soup, self.web_paths[i])
            docs.append(Document(page_content=text, metadata=metadata))

        return docs