From b157e0c1c357b36b5bb0409409837acb9af7d91a Mon Sep 17 00:00:00 2001
From: Daniel Chalef <131175+danielchalef@users.noreply.github.com>
Date: Thu, 16 Mar 2023 21:47:17 -0700
Subject: [PATCH] Add HTML document_loader that includes page title metadata
 (#1720)

This `BSHTMLLoader` document_loader loads an HTML document, extracts
text and adds the page title to the returned Document's metadata. The
loader uses the already installed bs4 package to extract both text
content and the page title.

Included in this PR is an example HTML file and an integration test that
tests against this file.

---------

Co-authored-by: Daniel Chalef <daniel.chalef@private.org>
---
 langchain/document_loaders/__init__.py        |  2 ++
 langchain/document_loaders/directory.py       |  5 ++-
 langchain/document_loaders/html_bs.py         | 36 +++++++++++++++++++
 .../document_loaders/test_bshtml.py           | 17 +++++++++
 tests/integration_tests/examples/example.html | 25 +++++++++++++
 5 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 langchain/document_loaders/html_bs.py
 create mode 100644 tests/integration_tests/document_loaders/test_bshtml.py
 create mode 100644 tests/integration_tests/examples/example.html

diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index 1c206623..78c07776 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -17,6 +17,7 @@ from langchain.document_loaders.googledrive import GoogleDriveLoader
 from langchain.document_loaders.gutenberg import GutenbergLoader
 from langchain.document_loaders.hn import HNLoader
 from langchain.document_loaders.html import UnstructuredHTMLLoader
+from langchain.document_loaders.html_bs import BSHTMLLoader
 from langchain.document_loaders.ifixit import IFixitLoader
 from langchain.document_loaders.image import UnstructuredImageLoader
 from langchain.document_loaders.imsdb import IMSDbLoader
@@ -64,6 +65,7 @@ __all__ = [
     "ReadTheDocsLoader",
     "GoogleDriveLoader",
     "UnstructuredHTMLLoader",
+    "BSHTMLLoader",
     "UnstructuredPowerPointLoader",
     "UnstructuredWordDocumentLoader",
     "UnstructuredPDFLoader",
diff --git a/langchain/document_loaders/directory.py b/langchain/document_loaders/directory.py
index 7c77daa9..ecffa85e 100644
--- a/langchain/document_loaders/directory.py
+++ b/langchain/document_loaders/directory.py
@@ -5,10 +5,13 @@ from typing import List, Type, Union
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.html_bs import BSHTMLLoader
 from langchain.document_loaders.text import TextLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
-FILE_LOADER_TYPE = Union[Type[UnstructuredFileLoader], Type[TextLoader]]
+FILE_LOADER_TYPE = Union[
+    Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader]
+]
 logger = logging.getLogger(__file__)
 
 
diff --git a/langchain/document_loaders/html_bs.py b/langchain/document_loaders/html_bs.py
new file mode 100644
index 00000000..92802ccb
--- /dev/null
+++ b/langchain/document_loaders/html_bs.py
@@ -0,0 +1,36 @@
+"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""
+
+import logging
+from typing import Dict, List, Union
+
+from bs4 import BeautifulSoup
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+logger = logging.getLogger(__file__)
+
+
+class BSHTMLLoader(BaseLoader):
+    """Load an HTML file with BeautifulSoup, recording its <title> as metadata."""
+
+    def __init__(self, file_path: str) -> None:
+        """Remember the path of the HTML file to parse; no I/O happens here."""
+        self.file_path = file_path
+
+    def load(self) -> List[Document]:
+        """Parse the file; return one Document with `source`/`title` metadata."""
+        # Read raw bytes so BeautifulSoup detects the document's own encoding
+        # instead of relying on the platform's default text encoding.
+        with open(self.file_path, "rb") as f:
+            soup = BeautifulSoup(f, features="lxml")
+
+        text = soup.get_text()
+        # get_text() yields "" when <title> is empty, whereas str(.string)
+        # would turn the missing string into the literal text "None".
+        title = soup.title.get_text() if soup.title else ""
+        metadata: Dict[str, Union[str, None]] = {
+            "source": self.file_path,
+            "title": title,
+        }
+        return [Document(page_content=text, metadata=metadata)]
diff --git a/tests/integration_tests/document_loaders/test_bshtml.py b/tests/integration_tests/document_loaders/test_bshtml.py
new file mode 100644
index 00000000..7e67d621
--- /dev/null
+++ b/tests/integration_tests/document_loaders/test_bshtml.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+from langchain.document_loaders.html_bs import BSHTMLLoader
+
+
+def test_bs_html_loader() -> None:
+    """Test that BSHTMLLoader returns one document with title and source."""
+    file_path = Path(__file__).parent.parent / "examples/example.html"
+    loader = BSHTMLLoader(str(file_path))
+    docs = loader.load()
+
+    assert len(docs) == 1
+
+    metadata = docs[0].metadata
+
+    assert metadata["title"] == "Chew dad's slippers"
+    assert metadata["source"] == str(file_path)
diff --git a/tests/integration_tests/examples/example.html b/tests/integration_tests/examples/example.html
new file mode 100644
index 00000000..b9318b7a
--- /dev/null
+++ b/tests/integration_tests/examples/example.html
@@ -0,0 +1,25 @@
+<html>
+  <head>
+    <title>Chew dad's slippers</title>
+  </head>
+  <body>
+    <h1>
+      Instead of drinking water from the cat bowl, make sure to steal water from
+      the toilet
+    </h1>
+    <h2>Chase the red dot</h2>
+    <p>
+      Munch, munch, chomp, chomp hate dogs. Spill litter box, scratch at owner,
+      destroy all furniture, especially couch get scared by sudden appearance of
+      cucumber cat is love, cat is life fat baby cat best buddy little guy for
+      catch eat throw up catch eat throw up bad birds jump on fridge. Purr like
+      a car engine oh yes, there is my human woman she does best pats ever that
+      all i like about her hiss meow .
+    </p>
+    <p>
+      Dead stare with ears cocked when owners are asleep, cry for no apparent
+      reason meow all night. Plop down in the middle where everybody walks favor
+      packaging over toy. Sit on the laptop kitty pounce, trip, faceplant.
+    </p>
+  </body>
+</html>