forked from Archives/langchain
Add html parsers (#4874)
# Add bs4 html parser

* Some minor refactors
* Extract the bs4 html parsing code from the bs html loader
* Move some tests from integration tests to unit tests
This commit is contained in:
parent
8e41143bf5
commit
0dc304ca80
@ -1,3 +1,4 @@
|
|||||||
|
from langchain.document_loaders.parsers.html import BS4HTMLParser
|
||||||
from langchain.document_loaders.parsers.pdf import (
|
from langchain.document_loaders.parsers.pdf import (
|
||||||
PDFMinerParser,
|
PDFMinerParser,
|
||||||
PDFPlumberParser,
|
PDFPlumberParser,
|
||||||
@ -7,9 +8,10 @@ from langchain.document_loaders.parsers.pdf import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"PyPDFParser",
|
"BS4HTMLParser",
|
||||||
"PDFMinerParser",
|
"PDFMinerParser",
|
||||||
|
"PDFPlumberParser",
|
||||||
"PyMuPDFParser",
|
"PyMuPDFParser",
|
||||||
"PyPDFium2Parser",
|
"PyPDFium2Parser",
|
||||||
"PDFPlumberParser",
|
"PyPDFParser",
|
||||||
]
|
]
|
||||||
|
3
langchain/document_loaders/parsers/html/__init__.py
Normal file
3
langchain/document_loaders/parsers/html/__init__.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser
|
||||||
|
|
||||||
|
# Names exported as the public API of the HTML parsers package.
__all__ = ["BS4HTMLParser"]
|
53
langchain/document_loaders/parsers/html/bs4.py
Normal file
53
langchain/document_loaders/parsers/html/bs4.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Any, Dict, Iterator, Union
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseBlobParser
|
||||||
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class BS4HTMLParser(BaseBlobParser):
    """Parser that uses beautiful soup to parse HTML files.

    Each blob is turned into a single ``Document`` whose page content is the
    text returned by ``BeautifulSoup.get_text`` and whose metadata holds the
    blob source and the page title.
    """

    def __init__(
        self,
        *,
        features: str = "lxml",
        get_text_separator: str = "",
        **kwargs: Any,
    ) -> None:
        """Initialize a bs4 based HTML parser.

        Args:
            features: Value passed to ``BeautifulSoup`` as ``features``.
            get_text_separator: Separator passed to ``soup.get_text``.
            **kwargs: Extra keyword arguments forwarded to ``BeautifulSoup``.

        Raises:
            ValueError: If the ``bs4`` package cannot be imported.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as exc:
            # Keep raising ValueError for existing callers, but chain the
            # original ImportError so the root cause stays in the traceback.
            raise ValueError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from exc

        self.bs_kwargs = {"features": features, **kwargs}
        self.get_text_separator = get_text_separator

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load HTML document into document objects.

        Args:
            blob: Blob whose bytes are parsed as HTML.

        Yields:
            A single ``Document`` with the extracted text as page content and
            ``source`` / ``title`` entries in its metadata.
        """
        from bs4 import BeautifulSoup

        with blob.as_bytes_io() as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        title = ""
        if soup.title:
            title_string = soup.title.string
            if title_string is not None:
                title = str(title_string)
            else:
                # `.string` is None when <title> is empty or has more than one
                # child; str(None) would store the literal text "None".
                title = soup.title.get_text()

        metadata: Dict[str, Union[str, None]] = {
            "source": blob.source,
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)
|
@ -1,12 +0,0 @@
|
|||||||
from langchain.document_loaders.parsers import __all__
|
|
||||||
|
|
||||||
|
|
||||||
def test_parsers_public_api_correct() -> None:
    """Check that the parsers package exports exactly the expected names."""
    expected_names = {
        "PyPDFParser",
        "PDFMinerParser",
        "PyMuPDFParser",
        "PyPDFium2Parser",
        "PDFPlumberParser",
    }
    assert set(__all__) == expected_names
|
|
@ -0,0 +1,28 @@
|
|||||||
|
"""Tests for the HTML parsers."""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain.document_loaders.blob_loaders import Blob
|
||||||
|
from langchain.document_loaders.parsers.html import BS4HTMLParser
|
||||||
|
|
||||||
|
HERE = Path(__file__).parent
|
||||||
|
EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None:
    """Parse example.html with BS4HTMLParser and check title, source and text."""
    file_path = EXAMPLES / "example.html"
    blob = Blob.from_path(file_path)
    parser = BS4HTMLParser(get_text_separator="|")

    # Materialize the lazy iterator so the number of documents can be checked.
    docs = list(parser.lazy_parse(blob))
    assert isinstance(docs, list)
    assert len(docs) == 1

    first_doc = docs[0]
    metadata = first_doc.metadata
    content = first_doc.page_content

    assert metadata["title"] == "Chew dad's slippers"
    assert metadata["source"] == str(file_path)
    # With "|" as separator the extracted text starts with a newline and a pipe.
    assert content[:2] == "\n|"
|
@ -4,6 +4,7 @@ from langchain.document_loaders.parsers import __all__
|
|||||||
def test_parsers_public_api_correct() -> None:
|
def test_parsers_public_api_correct() -> None:
|
||||||
"""Test public API of parsers for breaking changes."""
|
"""Test public API of parsers for breaking changes."""
|
||||||
assert set(__all__) == {
|
assert set(__all__) == {
|
||||||
|
"BS4HTMLParser",
|
||||||
"PyPDFParser",
|
"PyPDFParser",
|
||||||
"PDFMinerParser",
|
"PDFMinerParser",
|
||||||
"PyMuPDFParser",
|
"PyMuPDFParser",
|
||||||
|
@ -5,10 +5,14 @@ import pytest
|
|||||||
|
|
||||||
from langchain.document_loaders.html_bs import BSHTMLLoader
|
from langchain.document_loaders.html_bs import BSHTMLLoader
|
||||||
|
|
||||||
|
HERE = Path(__file__).parent
|
||||||
|
EXAMPLES = HERE.parent.parent / "integration_tests" / "examples"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("bs4", "lxml")
|
||||||
def test_bs_html_loader() -> None:
|
def test_bs_html_loader() -> None:
|
||||||
"""Test unstructured loader."""
|
"""Test unstructured loader."""
|
||||||
file_path = Path(__file__).parent.parent / "examples/example.html"
|
file_path = EXAMPLES / "example.html"
|
||||||
loader = BSHTMLLoader(str(file_path), get_text_separator="|")
|
loader = BSHTMLLoader(str(file_path), get_text_separator="|")
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
|
|
||||||
@ -26,9 +30,10 @@ def test_bs_html_loader() -> None:
|
|||||||
bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"),
|
bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"),
|
||||||
reason="default encoding is utf8",
|
reason="default encoding is utf8",
|
||||||
)
|
)
|
||||||
|
@pytest.mark.requires("bs4", "lxml")
|
||||||
def test_bs_html_loader_non_utf8() -> None:
|
def test_bs_html_loader_non_utf8() -> None:
|
||||||
"""Test providing encoding to BSHTMLLoader."""
|
"""Test providing encoding to BSHTMLLoader."""
|
||||||
file_path = Path(__file__).parent.parent / "examples/example-utf8.html"
|
file_path = EXAMPLES / "example-utf8.html"
|
||||||
|
|
||||||
with pytest.raises(UnicodeDecodeError):
|
with pytest.raises(UnicodeDecodeError):
|
||||||
BSHTMLLoader(str(file_path)).load()
|
BSHTMLLoader(str(file_path)).load()
|
Loading…
Reference in New Issue
Block a user