Add html parsers (#4874)

# Add bs4 html parser

* Some minor refactors
* Extract the bs4 html parsing code from the bs html loader
* Move some tests from integration tests to unit tests
This commit is contained in:
Eugene Yurtsev 2023-05-17 22:39:11 -04:00 committed by GitHub
parent 8e41143bf5
commit 0dc304ca80
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 96 additions and 16 deletions

View File

@ -1,3 +1,4 @@
from langchain.document_loaders.parsers.html import BS4HTMLParser
from langchain.document_loaders.parsers.pdf import ( from langchain.document_loaders.parsers.pdf import (
PDFMinerParser, PDFMinerParser,
PDFPlumberParser, PDFPlumberParser,
@ -7,9 +8,10 @@ from langchain.document_loaders.parsers.pdf import (
) )
__all__ = [ __all__ = [
"PyPDFParser", "BS4HTMLParser",
"PDFMinerParser", "PDFMinerParser",
"PDFPlumberParser",
"PyMuPDFParser", "PyMuPDFParser",
"PyPDFium2Parser", "PyPDFium2Parser",
"PDFPlumberParser", "PyPDFParser",
] ]

View File

@ -0,0 +1,3 @@
from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser
__all__ = ["BS4HTMLParser"]

View File

@ -0,0 +1,53 @@
"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""
import logging
from typing import Any, Dict, Iterator, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
logger = logging.getLogger(__name__)
class BS4HTMLParser(BaseBlobParser):
    """Parser that uses beautiful soup to parse HTML files.

    Each blob is turned into a single document: the page content is the text
    extracted from the HTML, and the metadata carries the blob's source and
    the page title.
    """

    def __init__(
        self,
        *,
        features: str = "lxml",
        get_text_separator: str = "",
        **kwargs: Any,
    ) -> None:
        """Initialize a bs4 based HTML parser.

        Args:
            features: Forwarded to ``BeautifulSoup`` as its ``features``
                argument.
            get_text_separator: Separator handed to ``soup.get_text`` when
                extracting the page text.
            **kwargs: Extra keyword arguments forwarded to ``BeautifulSoup``.

        Raises:
            ValueError: If the ``bs4`` package cannot be imported.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            # Exception type and message are kept for existing callers; the
            # original ImportError is chained explicitly for easier debugging.
            raise ValueError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from e

        self.bs_kwargs = {"features": features, **kwargs}
        self.get_text_separator = get_text_separator

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load HTML document into document objects.

        Args:
            blob: The blob holding the HTML to parse.

        Yields:
            A single document with the extracted text as ``page_content`` and
            ``source`` / ``title`` entries in its metadata.
        """
        # Imported lazily so that bs4 is only required when parsing happens.
        from bs4 import BeautifulSoup

        with blob.as_bytes_io() as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        title_tag = soup.title
        if not title_tag:
            title = ""
        elif title_tag.string is not None:
            title = str(title_tag.string)
        else:
            # ``.string`` is None for an empty <title> or one with nested
            # markup; str(None) would store the literal text "None" as the
            # title, so fall back to the tag's concatenated text instead.
            title = title_tag.get_text()

        metadata: Dict[str, Union[str, None]] = {
            "source": blob.source,
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)

View File

@ -1,12 +0,0 @@
from langchain.document_loaders.parsers import __all__
def test_parsers_public_api_correct() -> None:
    """Guard the parsers package's exported names against accidental changes."""
    expected_names = {
        "PDFMinerParser",
        "PDFPlumberParser",
        "PyMuPDFParser",
        "PyPDFParser",
        "PyPDFium2Parser",
    }
    assert set(__all__) == expected_names

View File

@ -0,0 +1,28 @@
"""Tests for the HTML parsers."""
from pathlib import Path
import pytest
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.html import BS4HTMLParser
HERE = Path(__file__).parent
EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None:
    """Parse the example HTML blob with BS4HTMLParser and check the result."""
    html_path = EXAMPLES / "example.html"
    html_blob = Blob.from_path(html_path)
    html_parser = BS4HTMLParser(get_text_separator="|")

    # lazy_parse returns an iterator; materialize it to inspect the output.
    documents = list(html_parser.lazy_parse(html_blob))
    assert isinstance(documents, list)
    assert len(documents) == 1

    (document,) = documents
    doc_metadata = document.metadata
    doc_text = document.page_content
    assert doc_metadata["title"] == "Chew dad's slippers"
    assert doc_metadata["source"] == str(html_path)
    assert doc_text[:2] == "\n|"

View File

@ -4,6 +4,7 @@ from langchain.document_loaders.parsers import __all__
def test_parsers_public_api_correct() -> None: def test_parsers_public_api_correct() -> None:
"""Test public API of parsers for breaking changes.""" """Test public API of parsers for breaking changes."""
assert set(__all__) == { assert set(__all__) == {
"BS4HTMLParser",
"PyPDFParser", "PyPDFParser",
"PDFMinerParser", "PDFMinerParser",
"PyMuPDFParser", "PyMuPDFParser",

View File

@ -5,10 +5,14 @@ import pytest
from langchain.document_loaders.html_bs import BSHTMLLoader from langchain.document_loaders.html_bs import BSHTMLLoader
HERE = Path(__file__).parent
EXAMPLES = HERE.parent.parent / "integration_tests" / "examples"
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None: def test_bs_html_loader() -> None:
"""Test unstructured loader.""" """Test unstructured loader."""
file_path = Path(__file__).parent.parent / "examples/example.html" file_path = EXAMPLES / "example.html"
loader = BSHTMLLoader(str(file_path), get_text_separator="|") loader = BSHTMLLoader(str(file_path), get_text_separator="|")
docs = loader.load() docs = loader.load()
@ -26,9 +30,10 @@ def test_bs_html_loader() -> None:
bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"), bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"),
reason="default encoding is utf8", reason="default encoding is utf8",
) )
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader_non_utf8() -> None: def test_bs_html_loader_non_utf8() -> None:
"""Test providing encoding to BSHTMLLoader.""" """Test providing encoding to BSHTMLLoader."""
file_path = Path(__file__).parent.parent / "examples/example-utf8.html" file_path = EXAMPLES / "example-utf8.html"
with pytest.raises(UnicodeDecodeError): with pytest.raises(UnicodeDecodeError):
BSHTMLLoader(str(file_path)).load() BSHTMLLoader(str(file_path)).load()