Add ability to pass kwargs to loader classes in `DirectoryLoader`, add ability to modify encoding and BeautifulSoup behaviour in `BSHTMLLoader` (#2275)

Solves #2247. Note that the only test I added checks the
BeautifulSoup behaviour change. Happy to add a test for
`DirectoryLoader` if deemed necessary.
doc
Sam Cordner-Matthews 1 year ago committed by GitHub
parent 2d0ff1a06d
commit 1ddd6dbf0b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -33,13 +33,17 @@ class DirectoryLoader(BaseLoader):
silent_errors: bool = False,
load_hidden: bool = False,
loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
loader_kwargs: Union[dict, None] = None,
recursive: bool = False,
):
"""Initialize with path to directory and how to glob over it."""
if loader_kwargs is None:
loader_kwargs = {}
self.path = path
self.glob = glob
self.load_hidden = load_hidden
self.loader_cls = loader_cls
self.loader_kwargs = loader_kwargs
self.silent_errors = silent_errors
self.recursive = recursive
@ -52,7 +56,7 @@ class DirectoryLoader(BaseLoader):
if i.is_file():
if _is_visible(i.relative_to(p)) or self.load_hidden:
try:
sub_docs = self.loader_cls(str(i)).load()
sub_docs = self.loader_cls(str(i), **self.loader_kwargs).load()
docs.extend(sub_docs)
except Exception as e:
if self.silent_errors:

@ -12,7 +12,14 @@ logger = logging.getLogger(__file__)
class BSHTMLLoader(BaseLoader):
"""Loader that uses beautiful soup to parse HTML files."""
def __init__(self, file_path: str) -> None:
def __init__(
self,
file_path: str,
open_encoding: Union[str, None] = None,
bs_kwargs: Union[dict, None] = None,
) -> None:
"""Initialise with path, and optionally, file encoding to use, and any kwargs
to pass to the BeautifulSoup object."""
try:
import bs4 # noqa:F401
except ImportError:
@ -21,13 +28,17 @@ class BSHTMLLoader(BaseLoader):
)
self.file_path = file_path
self.open_encoding = open_encoding
if bs_kwargs is None:
bs_kwargs = {"features": "lxml"}
self.bs_kwargs = bs_kwargs
def load(self) -> List[Document]:
from bs4 import BeautifulSoup
"""Load HTML document into document objects."""
with open(self.file_path, "r") as f:
soup = BeautifulSoup(f, features="lxml")
with open(self.file_path, "r", encoding=self.open_encoding) as f:
soup = BeautifulSoup(f, **self.bs_kwargs)
text = soup.get_text()

@ -1,5 +1,8 @@
import sys
from pathlib import Path
import pytest
from langchain.document_loaders.html_bs import BSHTMLLoader
@ -15,3 +18,25 @@ def test_bs_html_loader() -> None:
assert metadata["title"] == "Chew dad's slippers"
assert metadata["source"] == str(file_path)
@pytest.mark.skipif(
    bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"),
    reason="default encoding is utf8",
)
def test_bs_html_loader_non_utf8() -> None:
    """Check that BSHTMLLoader honours an explicit ``open_encoding``.

    Skipped when Python runs in UTF-8 mode or the platform name does not
    start with "win". On the remaining configurations the test expects
    loading the UTF-8 fixture without an encoding to raise
    ``UnicodeDecodeError``, and loading it with ``open_encoding="utf8"`` to
    yield a single document carrying the expected title and source metadata.
    """
    html_path = Path(__file__).parent.parent / "examples/example-utf8.html"
    source = str(html_path)

    # No encoding supplied: the loader is expected to fail while decoding.
    with pytest.raises(UnicodeDecodeError):
        BSHTMLLoader(source).load()

    # Explicit encoding supplied: the same file must now load cleanly.
    documents = BSHTMLLoader(source, open_encoding="utf8").load()
    assert len(documents) == 1

    meta = documents[0].metadata
    assert meta["title"] == "Chew dad's slippers"
    assert meta["source"] == source

@ -0,0 +1,25 @@
<html>
<head>
<title>Chew dad's slippers</title>
</head>
<body>
<h1>
Instead of drinking water from the cat bowl, make sure to steal water from
the toilet
</h1>
<h2>Chase the red dot</h2>
<p>
Munch, munch, chomp, chomp hate dogs. Spill litter box, scratch at owner,
destroy all furniture, especially couch get scared by sudden appearance of
cucumber cat is love, cat is life fat baby cat best buddy little guy for
catch eat throw up catch eat throw up bad birds jump on fridge. Purr like
a car engine oh yes, there is my human woman she does best pats ever that
all i like about her hiss meow .
</p>
<p>
Dead stare with ears cocked when “owners” are asleep, cry for no apparent
reason meow all night. Plop down in the middle where everybody walks favor
packaging over toy. Sit on the laptop kitty pounce, trip, faceplant.
</p>
</body>
</html>
Loading…
Cancel
Save