From 0dc304ca80dc920251779949ad1a5e199c8f395b Mon Sep 17 00:00:00 2001
From: Eugene Yurtsev <eyurtsev@gmail.com>
Date: Wed, 17 May 2023 22:39:11 -0400
Subject: [PATCH] Add html parsers (#4874)

# Add bs4 html parser

* Some minor refactors
* Extract the bs4 html parsing code from the bs html loader
* Move some tests from integration tests to unit tests
---
 .../document_loaders/parsers/__init__.py      |  6 ++-
 .../document_loaders/parsers/html/__init__.py |  3 ++
 .../document_loaders/parsers/html/bs4.py      | 53 +++++++++++++++++++
 .../parsers/test_public_api.py                | 12 -----
 .../parsers/test_html_parsers.py              | 28 ++++++++++
 .../parsers/test_public_api.py                |  1 +
 .../document_loaders/test_bshtml.py           |  9 +++-
 7 files changed, 96 insertions(+), 16 deletions(-)
 create mode 100644 langchain/document_loaders/parsers/html/__init__.py
 create mode 100644 langchain/document_loaders/parsers/html/bs4.py
 delete mode 100644 tests/integration_tests/document_loaders/parsers/test_public_api.py
 create mode 100644 tests/unit_tests/document_loaders/parsers/test_html_parsers.py
 rename tests/{integration_tests => unit_tests}/document_loaders/test_bshtml.py (80%)

diff --git a/langchain/document_loaders/parsers/__init__.py b/langchain/document_loaders/parsers/__init__.py
index d1e72bbb..94ac136d 100644
--- a/langchain/document_loaders/parsers/__init__.py
+++ b/langchain/document_loaders/parsers/__init__.py
@@ -1,3 +1,4 @@
+from langchain.document_loaders.parsers.html import BS4HTMLParser
 from langchain.document_loaders.parsers.pdf import (
     PDFMinerParser,
     PDFPlumberParser,
@@ -7,9 +8,10 @@ from langchain.document_loaders.parsers.pdf import (
 )
 
 __all__ = [
-    "PyPDFParser",
+    "BS4HTMLParser",
     "PDFMinerParser",
+    "PDFPlumberParser",
     "PyMuPDFParser",
     "PyPDFium2Parser",
-    "PDFPlumberParser",
+    "PyPDFParser",
 ]
diff --git a/langchain/document_loaders/parsers/html/__init__.py b/langchain/document_loaders/parsers/html/__init__.py
new file mode 100644
index 00000000..bceacaed
--- /dev/null
+++ b/langchain/document_loaders/parsers/html/__init__.py
@@ -0,0 +1,3 @@
+from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser
+
+__all__ = ["BS4HTMLParser"]
diff --git a/langchain/document_loaders/parsers/html/bs4.py b/langchain/document_loaders/parsers/html/bs4.py
new file mode 100644
index 00000000..627bee5f
--- /dev/null
+++ b/langchain/document_loaders/parsers/html/bs4.py
@@ -0,0 +1,73 @@
+"""Parser that uses bs4 to parse HTML blobs, enriching metadata with page title."""
+
+import logging
+from typing import Any, Dict, Iterator, Union
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseBlobParser
+from langchain.document_loaders.blob_loaders import Blob
+
+logger = logging.getLogger(__name__)
+
+
+class BS4HTMLParser(BaseBlobParser):
+    """Parser that uses beautiful soup to parse HTML files."""
+
+    def __init__(
+        self,
+        *,
+        features: str = "lxml",
+        get_text_separator: str = "",
+        **kwargs: Any,
+    ) -> None:
+        """Initialize a bs4 based HTML parser.
+
+        Args:
+            features: Value for ``BeautifulSoup``'s ``features`` argument.
+            get_text_separator: Separator passed to ``soup.get_text`` when
+                extracting the page text.
+            **kwargs: Extra keyword arguments forwarded to ``BeautifulSoup``.
+
+        Raises:
+            ValueError: If the ``bs4`` package cannot be imported.
+        """
+        try:
+            import bs4  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "beautifulsoup4 package not found, please install it with "
+                "`pip install beautifulsoup4`"
+            )
+
+        self.bs_kwargs = {"features": features, **kwargs}
+        self.get_text_separator = get_text_separator
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Parse an HTML blob into a single document.
+
+        Args:
+            blob: The blob whose bytes are fed to ``BeautifulSoup``.
+
+        Yields:
+            One ``Document`` holding the extracted text, with ``source`` and
+            ``title`` metadata. ``title`` is an empty string when the page has
+            no ``<title>`` or the title has no single string value.
+        """
+        from bs4 import BeautifulSoup
+
+        with blob.as_bytes_io() as f:
+            soup = BeautifulSoup(f, **self.bs_kwargs)
+
+        text = soup.get_text(self.get_text_separator)
+
+        # ``Tag.string`` is None for an empty <title> or one with nested tags;
+        # str(None) would otherwise leak the literal "None" into the metadata.
+        title = ""
+        if soup.title and soup.title.string is not None:
+            title = str(soup.title.string)
+
+        metadata: Dict[str, Union[str, None]] = {
+            "source": blob.source,
+            "title": title,
+        }
+        yield Document(page_content=text, metadata=metadata)
diff --git a/tests/integration_tests/document_loaders/parsers/test_public_api.py b/tests/integration_tests/document_loaders/parsers/test_public_api.py
deleted file mode 100644
index 00da8749..00000000
--- a/tests/integration_tests/document_loaders/parsers/test_public_api.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from langchain.document_loaders.parsers import __all__
-
-
-def test_parsers_public_api_correct() -> None:
-    """Test public API of parsers for breaking changes."""
-    assert set(__all__) == {
-        "PyPDFParser",
-        "PDFMinerParser",
-        "PyMuPDFParser",
-        "PyPDFium2Parser",
-        "PDFPlumberParser",
-    }
diff --git a/tests/unit_tests/document_loaders/parsers/test_html_parsers.py b/tests/unit_tests/document_loaders/parsers/test_html_parsers.py
new file mode 100644
index 00000000..6e6d5587
--- /dev/null
+++ b/tests/unit_tests/document_loaders/parsers/test_html_parsers.py
@@ -0,0 +1,28 @@
+"""Tests for the HTML parsers."""
+from pathlib import Path
+
+import pytest
+
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.document_loaders.parsers.html import BS4HTMLParser
+
+HERE = Path(__file__).parent
+EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
+
+
+@pytest.mark.requires("bs4", "lxml")
+def test_bs_html_loader() -> None:
+    """Test that BS4HTMLParser extracts text, title and source from an HTML blob."""
+    file_path = EXAMPLES / "example.html"
+    blob = Blob.from_path(file_path)
+    parser = BS4HTMLParser(get_text_separator="|")
+    docs = list(parser.lazy_parse(blob))
+    assert isinstance(docs, list)
+    assert len(docs) == 1  # the parser yields exactly one document per blob
+
+    metadata = docs[0].metadata
+    content = docs[0].page_content
+
+    assert metadata["title"] == "Chew dad's slippers"
+    assert metadata["source"] == str(file_path)
+    assert content[:2] == "\n|"  # the "|" separator appears in the extracted text
diff --git a/tests/unit_tests/document_loaders/parsers/test_public_api.py b/tests/unit_tests/document_loaders/parsers/test_public_api.py
index 00da8749..344b6281 100644
--- a/tests/unit_tests/document_loaders/parsers/test_public_api.py
+++ b/tests/unit_tests/document_loaders/parsers/test_public_api.py
@@ -4,6 +4,7 @@ from langchain.document_loaders.parsers import __all__
 def test_parsers_public_api_correct() -> None:
     """Test public API of parsers for breaking changes."""
     assert set(__all__) == {
+        "BS4HTMLParser",
         "PyPDFParser",
         "PDFMinerParser",
         "PyMuPDFParser",
diff --git a/tests/integration_tests/document_loaders/test_bshtml.py b/tests/unit_tests/document_loaders/test_bshtml.py
similarity index 80%
rename from tests/integration_tests/document_loaders/test_bshtml.py
rename to tests/unit_tests/document_loaders/test_bshtml.py
index 038371fa..0b458c56 100644
--- a/tests/integration_tests/document_loaders/test_bshtml.py
+++ b/tests/unit_tests/document_loaders/test_bshtml.py
@@ -5,10 +5,14 @@ import pytest
 
 from langchain.document_loaders.html_bs import BSHTMLLoader
 
+HERE = Path(__file__).parent
+EXAMPLES = HERE.parent.parent / "integration_tests" / "examples"
 
+
+@pytest.mark.requires("bs4", "lxml")
 def test_bs_html_loader() -> None:
     """Test unstructured loader."""
-    file_path = Path(__file__).parent.parent / "examples/example.html"
+    file_path = EXAMPLES / "example.html"
     loader = BSHTMLLoader(str(file_path), get_text_separator="|")
     docs = loader.load()
 
@@ -26,9 +30,10 @@ def test_bs_html_loader() -> None:
     bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"),
     reason="default encoding is utf8",
 )
+@pytest.mark.requires("bs4", "lxml")
 def test_bs_html_loader_non_utf8() -> None:
     """Test providing encoding to BSHTMLLoader."""
-    file_path = Path(__file__).parent.parent / "examples/example-utf8.html"
+    file_path = EXAMPLES / "example-utf8.html"
 
     with pytest.raises(UnicodeDecodeError):
         BSHTMLLoader(str(file_path)).load()