Add html parsers (#4874)

# Add bs4 html parser

* Some minor refactors
* Extract the bs4 html parsing code from the bs html loader
* Move some tests from integration tests to unit tests
2023-05-17 22:39:11 -04:00 · 2023-05-17 22:39:11 -04:00 · 0dc304ca80
commit 0dc304ca80
parent 8e41143bf5
7 changed files with 96 additions and 16 deletions
--- a/langchain/document_loaders/parsers/init.py
+++ b/langchain/document_loaders/parsers/init.py
@ -1,3 +1,4 @@
 from langchain.document_loaders.parsers.html import BS4HTMLParser
 from langchain.document_loaders.parsers.pdf import (
    PDFMinerParser,
    PDFPlumberParser,
@ -7,9 +8,10 @@ from langchain.document_loaders.parsers.pdf import (
 )
 __all__ = [
-    "PyPDFParser",
+    "BS4HTMLParser",
    "PDFMinerParser",
    "PDFPlumberParser",
    "PyMuPDFParser",
    "PyPDFium2Parser",
-    "PDFPlumberParser",
+    "PyPDFParser",
 ]
--- a/langchain/document_loaders/parsers/html/init.py
+++ b/langchain/document_loaders/parsers/html/init.py
@ -0,0 +1,3 @@
 from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser
 __all__ = ["BS4HTMLParser"]
--- a/langchain/document_loaders/parsers/html/bs4.py
+++ b/langchain/document_loaders/parsers/html/bs4.py
@ -0,0 +1,53 @@
 """Loader that uses bs4 to load HTML files, enriching metadata with page title."""
 import logging
 from typing import Any, Dict, Iterator, Union
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob
 logger = logging.getLogger(__name__)
 class BS4HTMLParser(BaseBlobParser):
    """Parser that uses beautiful soup to parse HTML files.

    Each blob is turned into a single document whose content is the text
    returned by ``soup.get_text`` and whose metadata holds the blob's
    ``source`` and the page ``title``.
    """
    def __init__(
        self,
        *,
        features: str = "lxml",
        get_text_separator: str = "",
        **kwargs: Any,
    ) -> None:
        """Initialize a bs4 based HTML parser.

        Args:
            features: Passed to ``BeautifulSoup`` as its ``features`` argument.
            get_text_separator: Separator handed to ``soup.get_text``.
            **kwargs: Additional keyword arguments forwarded to ``BeautifulSoup``.

        Raises:
            ValueError: If the ``bs4`` package cannot be imported.
        """
        try:
            import bs4  # noqa:F401
        except ImportError:
            raise ValueError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            )
        self.bs_kwargs = {"features": features, **kwargs}
        self.get_text_separator = get_text_separator
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load HTML document into document objects.

        Args:
            blob: The blob whose bytes are fed to ``BeautifulSoup``.

        Yields:
            A single ``Document`` with the extracted text as ``page_content``
            and ``source`` / ``title`` entries in its metadata. ``title`` is
            an empty string when the page has no usable title.
        """
        from bs4 import BeautifulSoup
        with blob.as_bytes_io() as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)
        text = soup.get_text(self.get_text_separator)
        title_tag = soup.title
        # ``.string`` is None when <title> is present but has no single string
        # child (e.g. an empty <title></title>); calling str() on it would put
        # the literal text "None" into the metadata, so map that case to "".
        title_string = title_tag.string if title_tag else None
        title = "" if title_string is None else str(title_string)
        metadata: Dict[str, Union[str, None]] = {
            "source": blob.source,
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)
--- a/tests/integration_tests/document_loaders/parsers/test_public_api.py
+++ b/tests/integration_tests/document_loaders/parsers/test_public_api.py
@ -1,12 +0,0 @@
 from langchain.document_loaders.parsers import __all__
 def test_parsers_public_api_correct() -> None:
    """Guard the names exported by the parsers package against changes."""
    exported_names = set(__all__)
    expected_names = {
        "PDFMinerParser",
        "PDFPlumberParser",
        "PyMuPDFParser",
        "PyPDFParser",
        "PyPDFium2Parser",
    }
    assert exported_names == expected_names
--- a/tests/unit_tests/document_loaders/parsers/test_html_parsers.py
+++ b/tests/unit_tests/document_loaders/parsers/test_html_parsers.py
@ -0,0 +1,28 @@
 """Tests for the HTML parsers."""
 from pathlib import Path
 import pytest
 from langchain.document_loaders.blob_loaders import Blob
 from langchain.document_loaders.parsers.html import BS4HTMLParser
 HERE = Path(__file__).parent
 EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None:
    """Parse example.html with BS4HTMLParser and check content and metadata."""
    html_path = EXAMPLES / "example.html"
    html_blob = Blob.from_path(html_path)
    html_parser = BS4HTMLParser(get_text_separator="|")
    documents = list(html_parser.lazy_parse(html_blob))

    # The parser yields exactly one document per blob.
    assert isinstance(documents, list)
    assert len(documents) == 1

    (document,) = documents
    assert document.metadata["title"] == "Chew dad's slippers"
    assert document.metadata["source"] == str(html_path)
    # The configured separator shows up right after the leading newline.
    assert document.page_content[:2] == "\n|"
--- a/tests/unit_tests/document_loaders/parsers/test_public_api.py
+++ b/tests/unit_tests/document_loaders/parsers/test_public_api.py
@ -4,6 +4,7 @@ from langchain.document_loaders.parsers import __all__
 def test_parsers_public_api_correct() -> None:
    """Test public API of parsers for breaking changes."""
    assert set(__all__) == {
        "BS4HTMLParser",
        "PyPDFParser",
        "PDFMinerParser",
        "PyMuPDFParser",
--- a/tests/integration_tests/document_loaders/test_bshtml.py
+++ b/tests/integration_tests/document_loaders/test_bshtml.py
@ -5,10 +5,14 @@ import pytest
 from langchain.document_loaders.html_bs import BSHTMLLoader
 HERE = Path(__file__).parent
 EXAMPLES = HERE.parent.parent / "integration_tests" / "examples"
@pytest.mark.requires("bs4", "lxml")
 def test_bs_html_loader() -> None:
    """Test unstructured loader."""
-    file_path = Path(__file__).parent.parent / "examples/example.html"
+    file_path = EXAMPLES / "example.html"
    loader = BSHTMLLoader(str(file_path), get_text_separator="|")
    docs = loader.load()
@ -26,9 +30,10 @@ def test_bs_html_loader() -> None:
    bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"),
    reason="default encoding is utf8",
 )
@pytest.mark.requires("bs4", "lxml")
 def test_bs_html_loader_non_utf8() -> None:
    """Test providing encoding to BSHTMLLoader."""
-    file_path = Path(__file__).parent.parent / "examples/example-utf8.html"
+    file_path = EXAMPLES / "example-utf8.html"
    with pytest.raises(UnicodeDecodeError):
        BSHTMLLoader(str(file_path)).load()
		`@ -0,0 +1,3 @@`
							`from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser`

							`__all__ = ["BS4HTMLParser"]`