From 0dc304ca80dc920251779949ad1a5e199c8f395b Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 17 May 2023 22:39:11 -0400 Subject: [PATCH] Add html parsers (#4874) # Add bs4 html parser * Some minor refactors * Extract the bs4 html parsing code from the bs html loader * Move some tests from integration tests to unit tests --- .../document_loaders/parsers/__init__.py | 6 ++- .../document_loaders/parsers/html/__init__.py | 3 ++ .../document_loaders/parsers/html/bs4.py | 53 +++++++++++++++++++ .../parsers/test_public_api.py | 12 ----- .../parsers/test_html_parsers.py | 28 ++++++++++ .../parsers/test_public_api.py | 1 + .../document_loaders/test_bshtml.py | 9 +++- 7 files changed, 96 insertions(+), 16 deletions(-) create mode 100644 langchain/document_loaders/parsers/html/__init__.py create mode 100644 langchain/document_loaders/parsers/html/bs4.py delete mode 100644 tests/integration_tests/document_loaders/parsers/test_public_api.py create mode 100644 tests/unit_tests/document_loaders/parsers/test_html_parsers.py rename tests/{integration_tests => unit_tests}/document_loaders/test_bshtml.py (80%) diff --git a/langchain/document_loaders/parsers/__init__.py b/langchain/document_loaders/parsers/__init__.py index d1e72bbb..94ac136d 100644 --- a/langchain/document_loaders/parsers/__init__.py +++ b/langchain/document_loaders/parsers/__init__.py @@ -1,3 +1,4 @@ +from langchain.document_loaders.parsers.html import BS4HTMLParser from langchain.document_loaders.parsers.pdf import ( PDFMinerParser, PDFPlumberParser, @@ -7,9 +8,10 @@ from langchain.document_loaders.parsers.pdf import ( ) __all__ = [ - "PyPDFParser", + "BS4HTMLParser", "PDFMinerParser", + "PDFPlumberParser", "PyMuPDFParser", "PyPDFium2Parser", - "PDFPlumberParser", + "PyPDFParser", ] diff --git a/langchain/document_loaders/parsers/html/__init__.py b/langchain/document_loaders/parsers/html/__init__.py new file mode 100644 index 00000000..bceacaed --- /dev/null +++ 
b/langchain/document_loaders/parsers/html/__init__.py
@@ -0,0 +1,3 @@
+from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser
+
+__all__ = ["BS4HTMLParser"]
diff --git a/langchain/document_loaders/parsers/html/bs4.py b/langchain/document_loaders/parsers/html/bs4.py
new file mode 100644
index 00000000..627bee5f
--- /dev/null
+++ b/langchain/document_loaders/parsers/html/bs4.py
@@ -0,0 +1,69 @@
+"""Parser that uses bs4 to parse HTML files, enriching metadata with page title."""
+
+import logging
+from typing import Any, Dict, Iterator, Union
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseBlobParser
+from langchain.document_loaders.blob_loaders import Blob
+
+logger = logging.getLogger(__name__)
+
+
+class BS4HTMLParser(BaseBlobParser):
+    """Parser that uses beautiful soup to parse HTML files."""
+
+    def __init__(
+        self,
+        *,
+        features: str = "lxml",
+        get_text_separator: str = "",
+        **kwargs: Any,
+    ) -> None:
+        """Initialize a bs4 based HTML parser.
+
+        Args:
+            features: Value passed to ``BeautifulSoup`` as ``features``.
+            get_text_separator: Separator passed to ``soup.get_text``.
+            **kwargs: Extra keyword arguments forwarded to ``BeautifulSoup``.
+
+        Raises:
+            ValueError: If the ``bs4`` module cannot be imported.
+        """
+        try:
+            import bs4  # noqa:F401
+        except ImportError as err:
+            # Chain the original error so the real import failure stays visible.
+            raise ValueError(
+                "beautifulsoup4 package not found, please install it with "
+                "`pip install beautifulsoup4`"
+            ) from err
+
+        self.bs_kwargs = {"features": features, **kwargs}
+        self.get_text_separator = get_text_separator
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Parse an HTML blob and yield a single document.
+
+        The document's content is the text of the parsed HTML; its metadata
+        holds ``blob.source`` under "source" and the page title under "title".
+        """
+        from bs4 import BeautifulSoup
+
+        with blob.as_bytes_io() as f:
+            soup = BeautifulSoup(f, **self.bs_kwargs)
+
+        text = soup.get_text(self.get_text_separator)
+
+        # ``.string`` is None for an empty or multi-child <title>; without the
+        # guard, ``str(None)`` would store the literal text "None" as the title.
+        if soup.title and soup.title.string is not None:
+            title = str(soup.title.string)
+        else:
+            title = ""
+
+        metadata: Dict[str, Union[str, None]] = {
+            "source": blob.source,
+            "title": title,
+        }
+        yield Document(page_content=text, metadata=metadata)
diff --git a/tests/integration_tests/document_loaders/parsers/test_public_api.py b/tests/integration_tests/document_loaders/parsers/test_public_api.py
deleted file mode 100644
index 
00da8749..00000000
--- a/tests/integration_tests/document_loaders/parsers/test_public_api.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from langchain.document_loaders.parsers import __all__
-
-
-def test_parsers_public_api_correct() -> None:
-    """Test public API of parsers for breaking changes."""
-    assert set(__all__) == {
-        "PyPDFParser",
-        "PDFMinerParser",
-        "PyMuPDFParser",
-        "PyPDFium2Parser",
-        "PDFPlumberParser",
-    }
diff --git a/tests/unit_tests/document_loaders/parsers/test_html_parsers.py b/tests/unit_tests/document_loaders/parsers/test_html_parsers.py
new file mode 100644
index 00000000..6e6d5587
--- /dev/null
+++ b/tests/unit_tests/document_loaders/parsers/test_html_parsers.py
@@ -0,0 +1,28 @@
+"""Tests for the HTML parsers."""
+from pathlib import Path
+
+import pytest
+
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.document_loaders.parsers.html import BS4HTMLParser
+
+HERE = Path(__file__).parent
+EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
+
+
+@pytest.mark.requires("bs4", "lxml")
+def test_bs_html_loader() -> None:
+    """Test that BS4HTMLParser extracts text, title and source from an HTML blob."""
+    file_path = EXAMPLES / "example.html"
+    blob = Blob.from_path(file_path)
+    parser = BS4HTMLParser(get_text_separator="|")
+    docs = list(parser.lazy_parse(blob))
+    assert isinstance(docs, list)
+    assert len(docs) == 1
+
+    metadata = docs[0].metadata
+    content = docs[0].page_content
+
+    assert metadata["title"] == "Chew dad's slippers"
+    assert metadata["source"] == str(file_path)
+    assert content[:2] == "\n|"
diff --git a/tests/unit_tests/document_loaders/parsers/test_public_api.py b/tests/unit_tests/document_loaders/parsers/test_public_api.py
index 00da8749..344b6281 100644
--- a/tests/unit_tests/document_loaders/parsers/test_public_api.py
+++ b/tests/unit_tests/document_loaders/parsers/test_public_api.py
@@ -4,6 +4,7 @@ from langchain.document_loaders.parsers import __all__
 def test_parsers_public_api_correct() -> None:
     """Test public API of parsers for 
breaking changes.""" assert set(__all__) == { + "BS4HTMLParser", "PyPDFParser", "PDFMinerParser", "PyMuPDFParser", diff --git a/tests/integration_tests/document_loaders/test_bshtml.py b/tests/unit_tests/document_loaders/test_bshtml.py similarity index 80% rename from tests/integration_tests/document_loaders/test_bshtml.py rename to tests/unit_tests/document_loaders/test_bshtml.py index 038371fa..0b458c56 100644 --- a/tests/integration_tests/document_loaders/test_bshtml.py +++ b/tests/unit_tests/document_loaders/test_bshtml.py @@ -5,10 +5,14 @@ import pytest from langchain.document_loaders.html_bs import BSHTMLLoader +HERE = Path(__file__).parent +EXAMPLES = HERE.parent.parent / "integration_tests" / "examples" + +@pytest.mark.requires("bs4", "lxml") def test_bs_html_loader() -> None: """Test unstructured loader.""" - file_path = Path(__file__).parent.parent / "examples/example.html" + file_path = EXAMPLES / "example.html" loader = BSHTMLLoader(str(file_path), get_text_separator="|") docs = loader.load() @@ -26,9 +30,10 @@ def test_bs_html_loader() -> None: bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"), reason="default encoding is utf8", ) +@pytest.mark.requires("bs4", "lxml") def test_bs_html_loader_non_utf8() -> None: """Test providing encoding to BSHTMLLoader.""" - file_path = Path(__file__).parent.parent / "examples/example-utf8.html" + file_path = EXAMPLES / "example-utf8.html" with pytest.raises(UnicodeDecodeError): BSHTMLLoader(str(file_path)).load()