forked from Archives/langchain
Add html parsers (#4874)
# Add bs4 html parser
* Some minor refactors
* Extract the bs4 html parsing code from the bs html loader
* Move some tests from integration tests to unit tests
This commit is contained in:
parent
8e41143bf5
commit
0dc304ca80
@ -1,3 +1,4 @@
|
||||
from langchain.document_loaders.parsers.html import BS4HTMLParser
|
||||
from langchain.document_loaders.parsers.pdf import (
|
||||
PDFMinerParser,
|
||||
PDFPlumberParser,
|
||||
@ -7,9 +8,10 @@ from langchain.document_loaders.parsers.pdf import (
|
||||
)
|
||||
|
||||
# Public API of the parsers package. Each exported name is listed exactly
# once, in alphabetical order.
__all__ = [
    "BS4HTMLParser",
    "PDFMinerParser",
    "PDFPlumberParser",
    "PyMuPDFParser",
    "PyPDFium2Parser",
    "PyPDFParser",
]
|
||||
|
3
langchain/document_loaders/parsers/html/__init__.py
Normal file
3
langchain/document_loaders/parsers/html/__init__.py
Normal file
@ -0,0 +1,3 @@
|
||||
from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser
|
||||
|
||||
# Names re-exported by the html parsers subpackage.
__all__ = [
    "BS4HTMLParser",
]
|
53
langchain/document_loaders/parsers/html/bs4.py
Normal file
53
langchain/document_loaders/parsers/html/bs4.py
Normal file
@ -0,0 +1,53 @@
|
||||
"""Parser that uses bs4 to parse HTML blobs, enriching metadata with page title."""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, Iterator, Union
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseBlobParser
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
|
||||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BS4HTMLParser(BaseBlobParser):
    """Parser that uses beautiful soup to parse HTML files.

    Each blob is turned into a single ``Document`` whose ``page_content`` is
    the text extracted from the markup and whose metadata holds the blob's
    ``source`` and the page ``title``.
    """

    def __init__(
        self,
        *,
        features: str = "lxml",
        get_text_separator: str = "",
        **kwargs: Any,
    ) -> None:
        """Initialize a bs4 based HTML parser.

        Args:
            features: Passed to ``BeautifulSoup`` as its ``features`` argument.
            get_text_separator: Separator passed to ``soup.get_text``.
            **kwargs: Extra keyword arguments forwarded to ``BeautifulSoup``.

        Raises:
            ValueError: If the ``bs4`` package cannot be imported.
        """
        try:
            # Fail at construction time rather than on the first parse.
            import bs4  # noqa:F401
        except ImportError:
            raise ValueError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            )

        self.bs_kwargs = {"features": features, **kwargs}
        self.get_text_separator = get_text_separator

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load HTML document into document objects.

        Args:
            blob: The blob whose bytes are parsed as HTML.

        Yields:
            A single ``Document`` with the extracted text as ``page_content``
            and ``source`` / ``title`` entries in its metadata. ``title`` is
            an empty string when the page has no usable title.
        """
        from bs4 import BeautifulSoup

        with blob.as_bytes_io() as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        # ``Tag.string`` is None when <title> is empty or has several
        # children; calling str() on it would store the literal "None".
        title_string = soup.title.string if soup.title else None
        title = "" if title_string is None else str(title_string)

        metadata: Dict[str, Union[str, None]] = {
            "source": blob.source,
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)
|
@ -1,12 +0,0 @@
|
||||
from langchain.document_loaders.parsers import __all__
|
||||
|
||||
|
||||
def test_parsers_public_api_correct() -> None:
    """Guard the exported names of the parsers package against changes."""
    expected_names = {
        "PDFMinerParser",
        "PDFPlumberParser",
        "PyMuPDFParser",
        "PyPDFParser",
        "PyPDFium2Parser",
    }
    assert set(__all__) == expected_names
|
@ -0,0 +1,28 @@
|
||||
"""Tests for the HTML parsers."""
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain.document_loaders.blob_loaders import Blob
|
||||
from langchain.document_loaders.parsers.html import BS4HTMLParser
|
||||
|
||||
# Directory containing this test module.
HERE = Path(__file__).parent
# Example fixtures live under integration_tests/examples, three levels up.
EXAMPLES = HERE.parent.parent.parent.joinpath("integration_tests", "examples")
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None:
    """Parse the example HTML file with BS4HTMLParser and check the output."""
    html_path = EXAMPLES / "example.html"
    source_blob = Blob.from_path(html_path)
    html_parser = BS4HTMLParser(get_text_separator="|")

    # lazy_parse returns an iterator; materialize it to inspect the results.
    parsed = list(html_parser.lazy_parse(source_blob))
    assert isinstance(parsed, list)
    assert len(parsed) == 1

    document = parsed[0]
    doc_metadata = document.metadata
    doc_text = document.page_content

    assert doc_metadata["title"] == "Chew dad's slippers"
    assert doc_metadata["source"] == str(html_path)
    assert doc_text[:2] == "\n|"
|
@ -4,6 +4,7 @@ from langchain.document_loaders.parsers import __all__
|
||||
def test_parsers_public_api_correct() -> None:
|
||||
"""Test public API of parsers for breaking changes."""
|
||||
assert set(__all__) == {
|
||||
"BS4HTMLParser",
|
||||
"PyPDFParser",
|
||||
"PDFMinerParser",
|
||||
"PyMuPDFParser",
|
||||
|
@ -5,10 +5,14 @@ import pytest
|
||||
|
||||
from langchain.document_loaders.html_bs import BSHTMLLoader
|
||||
|
||||
# Directory containing this test module.
HERE = Path(__file__).parent
# Example fixtures live under integration_tests/examples, two levels up.
EXAMPLES = HERE.parent.parent.joinpath("integration_tests", "examples")
|
||||
|
||||
|
||||
@pytest.mark.requires("bs4", "lxml")
|
||||
def test_bs_html_loader() -> None:
|
||||
"""Test the BeautifulSoup-based HTML loader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/example.html"
|
||||
file_path = EXAMPLES / "example.html"
|
||||
loader = BSHTMLLoader(str(file_path), get_text_separator="|")
|
||||
docs = loader.load()
|
||||
|
||||
@ -26,9 +30,10 @@ def test_bs_html_loader() -> None:
|
||||
bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"),
|
||||
reason="default encoding is utf8",
|
||||
)
|
||||
@pytest.mark.requires("bs4", "lxml")
|
||||
def test_bs_html_loader_non_utf8() -> None:
|
||||
"""Test providing encoding to BSHTMLLoader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/example-utf8.html"
|
||||
file_path = EXAMPLES / "example-utf8.html"
|
||||
|
||||
with pytest.raises(UnicodeDecodeError):
|
||||
BSHTMLLoader(str(file_path)).load()
|
Loading…
Reference in New Issue
Block a user