From 1ddd6dbf0b336c93331f8edf88e793a0871e0214 Mon Sep 17 00:00:00 2001 From: Sam Cordner-Matthews <74786574+samuelwcm@users.noreply.github.com> Date: Sat, 1 Apr 2023 20:48:27 +0100 Subject: [PATCH] Add ability to pass kwargs to loader classes in `DirectoryLoader`, add ability to modify encoding and BeautifulSoup behaviour in `BSHTMLLoader` (#2275) Solves #2247. Noted that the only test I added checks for the BeautifulSoup behaviour change. Happy to add a test for `DirectoryLoader` if deemed necessary. --- langchain/document_loaders/directory.py | 6 ++++- langchain/document_loaders/html_bs.py | 17 ++++++++++--- .../document_loaders/test_bshtml.py | 25 +++++++++++++++++++ .../examples/example-utf8.html | 25 +++++++++++++++++++ 4 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 tests/integration_tests/examples/example-utf8.html diff --git a/langchain/document_loaders/directory.py b/langchain/document_loaders/directory.py index ecffa85e..370875d7 100644 --- a/langchain/document_loaders/directory.py +++ b/langchain/document_loaders/directory.py @@ -33,13 +33,17 @@ class DirectoryLoader(BaseLoader): silent_errors: bool = False, load_hidden: bool = False, loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader, + loader_kwargs: Union[dict, None] = None, recursive: bool = False, ): """Initialize with path to directory and how to glob over it.""" + if loader_kwargs is None: + loader_kwargs = {} self.path = path self.glob = glob self.load_hidden = load_hidden self.loader_cls = loader_cls + self.loader_kwargs = loader_kwargs self.silent_errors = silent_errors self.recursive = recursive @@ -52,7 +56,7 @@ class DirectoryLoader(BaseLoader): if i.is_file(): if _is_visible(i.relative_to(p)) or self.load_hidden: try: - sub_docs = self.loader_cls(str(i)).load() + sub_docs = self.loader_cls(str(i), **self.loader_kwargs).load() docs.extend(sub_docs) except Exception as e: if self.silent_errors: diff --git a/langchain/document_loaders/html_bs.py b/langchain/document_loaders/html_bs.py index 568d8054..42f27b9b 100644 --- a/langchain/document_loaders/html_bs.py +++ b/langchain/document_loaders/html_bs.py @@ -12,7 +12,14 @@ logger = logging.getLogger(__file__) class BSHTMLLoader(BaseLoader): """Loader that uses beautiful soup to parse HTML files.""" - def __init__(self, file_path: str) -> None: + def __init__( + self, + file_path: str, + open_encoding: Union[str, None] = None, + bs_kwargs: Union[dict, None] = None, + ) -> None: + """Initialise with path, and optionally, file encoding to use, and any kwargs + to pass to the BeautifulSoup object.""" try: import bs4 # noqa:F401 except ImportError: @@ -21,13 +28,17 @@ class BSHTMLLoader(BaseLoader): ) self.file_path = file_path + self.open_encoding = open_encoding + if bs_kwargs is None: + bs_kwargs = {"features": "lxml"} + self.bs_kwargs = bs_kwargs def load(self) -> List[Document]: from bs4 import BeautifulSoup """Load HTML document into document objects.""" - with open(self.file_path, "r") as f: - soup = BeautifulSoup(f, features="lxml") + with open(self.file_path, "r", encoding=self.open_encoding) as f: + soup = BeautifulSoup(f, **self.bs_kwargs) text = soup.get_text() diff --git a/tests/integration_tests/document_loaders/test_bshtml.py b/tests/integration_tests/document_loaders/test_bshtml.py index 7e67d621..2510788e 100644 --- a/tests/integration_tests/document_loaders/test_bshtml.py +++ b/tests/integration_tests/document_loaders/test_bshtml.py @@ -1,5 +1,8 @@ +import sys from pathlib import Path +import pytest + from langchain.document_loaders.html_bs import BSHTMLLoader @@ -15,3 +18,25 @@ def test_bs_html_loader() -> None: assert metadata["title"] == "Chew dad's slippers" assert metadata["source"] == str(file_path) + + +@pytest.mark.skipif( + bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"), + reason="default encoding is utf8", +) +def test_bs_html_loader_non_utf8() -> None: + """Test providing encoding to BSHTMLLoader.""" + file_path = Path(__file__).parent.parent / "examples/example-utf8.html" + + with pytest.raises(UnicodeDecodeError): + BSHTMLLoader(str(file_path)).load() + + loader = BSHTMLLoader(str(file_path), open_encoding="utf8") + docs = loader.load() + + assert len(docs) == 1 + + metadata = docs[0].metadata + + assert metadata["title"] == "Chew dad's slippers" + assert metadata["source"] == str(file_path) diff --git a/tests/integration_tests/examples/example-utf8.html b/tests/integration_tests/examples/example-utf8.html new file mode 100644 index 00000000..f96e20fc --- /dev/null +++ b/tests/integration_tests/examples/example-utf8.html @@ -0,0 +1,25 @@ + + + Chew dad's slippers + + +

+ Instead of drinking water from the cat bowl, make sure to steal water from + the toilet +

+

Chase the red dot

+

+ Munch, munch, chomp, chomp hate dogs. Spill litter box, scratch at owner, + destroy all furniture, especially couch get scared by sudden appearance of + cucumber cat is love, cat is life fat baby cat best buddy little guy for + catch eat throw up catch eat throw up bad birds jump on fridge. Purr like + a car engine oh yes, there is my human woman she does best pats ever that + all i like about her hiss meow . +

+

+ Dead stare with ears cocked when “owners” are asleep, cry for no apparent + reason meow all night. Plop down in the middle where everybody walks favor + packaging over toy. Sit on the laptop kitty pounce, trip, faceplant. +

+ +