Add html parsers (#4874)

# Add bs4 html parser

* Some minor refactors
* Extract the bs4 html parsing code from the bs html loader
* Move some tests from integration tests to unit tests
2023-05-17 22:39:11 -04:00 · 2023-05-17 22:39:11 -04:00 · 0dc304ca80
commit 0dc304ca80
parent 8e41143bf5
7 changed files with 96 additions and 16 deletions
--- a/langchain/document_loaders/parsers/__init__.py
+++ b/langchain/document_loaders/parsers/__init__.py
@@ -1,3 +1,4 @@
+from langchain.document_loaders.parsers.html import BS4HTMLParser
 from langchain.document_loaders.parsers.pdf import (
    PDFMinerParser,
    PDFPlumberParser,
@@ -7,9 +8,10 @@ from langchain.document_loaders.parsers.pdf import (
 )

 __all__ = [
-    "PyPDFParser",
+    "BS4HTMLParser",
    "PDFMinerParser",
+    "PDFPlumberParser",
    "PyMuPDFParser",
    "PyPDFium2Parser",
-    "PDFPlumberParser",
+    "PyPDFParser",
 ]
--- a/langchain/document_loaders/parsers/html/__init__.py
+++ b/langchain/document_loaders/parsers/html/__init__.py
@@ -0,0 +1,3 @@
+from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser
+
+__all__ = ["BS4HTMLParser"]
--- a/langchain/document_loaders/parsers/html/bs4.py
+++ b/langchain/document_loaders/parsers/html/bs4.py
@@ -0,0 +1,53 @@
+"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""
+
+import logging
+from typing import Any, Dict, Iterator, Union
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseBlobParser
+from langchain.document_loaders.blob_loaders import Blob
+
+logger = logging.getLogger(__name__)
+
+
+class BS4HTMLParser(BaseBlobParser):
+    """Parser that uses beautiful soup to parse HTML files."""
+
+    def __init__(
+        self,
+        *,
+        features: str = "lxml",
+        get_text_separator: str = "",
+        **kwargs: Any,
+    ) -> None:
+        """Initialize a bs4 based HTML parser."""
+        try:
+            import bs4  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "beautifulsoup4 package not found, please install it with "
+                "`pip install beautifulsoup4`"
+            )
+
+        self.bs_kwargs = {"features": features, **kwargs}
+        self.get_text_separator = get_text_separator
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Load HTML document into document objects."""
+        from bs4 import BeautifulSoup
+
+        with blob.as_bytes_io() as f:
+            soup = BeautifulSoup(f, **self.bs_kwargs)
+
+        text = soup.get_text(self.get_text_separator)
+
+        if soup.title:
+            title = str(soup.title.string)
+        else:
+            title = ""
+
+        metadata: Dict[str, Union[str, None]] = {
+            "source": blob.source,
+            "title": title,
+        }
+        yield Document(page_content=text, metadata=metadata)
--- a/tests/integration_tests/document_loaders/parsers/test_public_api.py
+++ b/tests/integration_tests/document_loaders/parsers/test_public_api.py
@@ -1,12 +0,0 @@
-from langchain.document_loaders.parsers import __all__
-
-
-def test_parsers_public_api_correct() -> None:
-    """Test public API of parsers for breaking changes."""
-    assert set(__all__) == {
-        "PyPDFParser",
-        "PDFMinerParser",
-        "PyMuPDFParser",
-        "PyPDFium2Parser",
-        "PDFPlumberParser",
-    }
--- a/tests/unit_tests/document_loaders/parsers/test_html_parsers.py
+++ b/tests/unit_tests/document_loaders/parsers/test_html_parsers.py
@@ -0,0 +1,28 @@
+"""Tests for the HTML parsers."""
+from pathlib import Path
+
+import pytest
+
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.document_loaders.parsers.html import BS4HTMLParser
+
+HERE = Path(__file__).parent
+EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
+
+
+@pytest.mark.requires("bs4", "lxml")
+def test_bs_html_loader() -> None:
+    """Test unstructured loader."""
+    file_path = EXAMPLES / "example.html"
+    blob = Blob.from_path(file_path)
+    parser = BS4HTMLParser(get_text_separator="|")
+    docs = list(parser.lazy_parse(blob))
+    assert isinstance(docs, list)
+    assert len(docs) == 1
+
+    metadata = docs[0].metadata
+    content = docs[0].page_content
+
+    assert metadata["title"] == "Chew dad's slippers"
+    assert metadata["source"] == str(file_path)
+    assert content[:2] == "\n|"
--- a/tests/unit_tests/document_loaders/parsers/test_public_api.py
+++ b/tests/unit_tests/document_loaders/parsers/test_public_api.py
@@ -4,6 +4,7 @@ from langchain.document_loaders.parsers import __all__
 def test_parsers_public_api_correct() -> None:
    """Test public API of parsers for breaking changes."""
    assert set(__all__) == {
+        "BS4HTMLParser",
        "PyPDFParser",
        "PDFMinerParser",
        "PyMuPDFParser",
--- a/tests/integration_tests/document_loaders/test_bshtml.py
+++ b/tests/integration_tests/document_loaders/test_bshtml.py
@@ -5,10 +5,14 @@ import pytest

 from langchain.document_loaders.html_bs import BSHTMLLoader

+HERE = Path(__file__).parent
+EXAMPLES = HERE.parent.parent / "integration_tests" / "examples"

+
+@pytest.mark.requires("bs4", "lxml")
 def test_bs_html_loader() -> None:
    """Test unstructured loader."""
-    file_path = Path(__file__).parent.parent / "examples/example.html"
+    file_path = EXAMPLES / "example.html"
    loader = BSHTMLLoader(str(file_path), get_text_separator="|")
    docs = loader.load()

@@ -26,9 +30,10 @@ def test_bs_html_loader() -> None:
    bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"),
    reason="default encoding is utf8",
 )
+@pytest.mark.requires("bs4", "lxml")
 def test_bs_html_loader_non_utf8() -> None:
    """Test providing encoding to BSHTMLLoader."""
-    file_path = Path(__file__).parent.parent / "examples/example-utf8.html"
+    file_path = EXAMPLES / "example-utf8.html"

    with pytest.raises(UnicodeDecodeError):
        BSHTMLLoader(str(file_path)).load()