diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 1c2066231b..78c0777646 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -17,6 +17,7 @@ from langchain.document_loaders.googledrive import GoogleDriveLoader from langchain.document_loaders.gutenberg import GutenbergLoader from langchain.document_loaders.hn import HNLoader from langchain.document_loaders.html import UnstructuredHTMLLoader +from langchain.document_loaders.html_bs import BSHTMLLoader from langchain.document_loaders.ifixit import IFixitLoader from langchain.document_loaders.image import UnstructuredImageLoader from langchain.document_loaders.imsdb import IMSDbLoader @@ -64,6 +65,7 @@ __all__ = [ "ReadTheDocsLoader", "GoogleDriveLoader", "UnstructuredHTMLLoader", + "BSHTMLLoader", "UnstructuredPowerPointLoader", "UnstructuredWordDocumentLoader", "UnstructuredPDFLoader", diff --git a/langchain/document_loaders/directory.py b/langchain/document_loaders/directory.py index 7c77daa91b..ecffa85eb5 100644 --- a/langchain/document_loaders/directory.py +++ b/langchain/document_loaders/directory.py @@ -5,10 +5,13 @@ from typing import List, Type, Union from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader +from langchain.document_loaders.html_bs import BSHTMLLoader from langchain.document_loaders.text import TextLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader -FILE_LOADER_TYPE = Union[Type[UnstructuredFileLoader], Type[TextLoader]] +FILE_LOADER_TYPE = Union[ + Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader] +] logger = logging.getLogger(__file__) diff --git a/langchain/document_loaders/html_bs.py b/langchain/document_loaders/html_bs.py new file mode 100644 index 0000000000..92802ccb5d --- /dev/null +++ b/langchain/document_loaders/html_bs.py @@ -0,0 +1,36 @@ +"""Loader that uses bs4 to load HTML files, enriching 
metadata with page title."""
+
+import logging
+from typing import Dict, List, Union
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+logger = logging.getLogger(__file__)
+
+
+class BSHTMLLoader(BaseLoader):
+    """Loader that uses beautiful soup to parse HTML files.
+
+    The whole file is turned into a single ``Document``: its ``page_content``
+    is the text returned by ``BeautifulSoup.get_text`` and its metadata holds
+    the ``source`` path and the page ``title``.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        open_encoding: Union[str, None] = None,
+        bs_kwargs: Union[dict, None] = None,
+    ) -> None:
+        """Initialize with the path of the HTML file to load.
+
+        Args:
+            file_path: Path of the HTML file.
+            open_encoding: Encoding passed to ``open``. ``None`` (the default)
+                keeps the platform default encoding.
+            bs_kwargs: Keyword arguments forwarded to ``BeautifulSoup``.
+                Defaults to ``{"features": "lxml"}``.
+
+        Raises:
+            ImportError: If the ``bs4`` package is not installed.
+        """
+        # bs4 is imported lazily: this module is imported by the package
+        # ``__init__``, so a module-level bs4 import would make bs4 a hard
+        # dependency of every document loader.
+        try:
+            import bs4  # noqa: F401
+        except ImportError as e:
+            raise ImportError(
+                "bs4 package not found, please install it with "
+                "`pip install beautifulsoup4`"
+            ) from e
+
+        self.file_path = file_path
+        self.open_encoding = open_encoding
+        if bs_kwargs is None:
+            bs_kwargs = {"features": "lxml"}
+        self.bs_kwargs = bs_kwargs
+
+    def load(self) -> List[Document]:
+        """Load HTML document into document objects.
+
+        Returns:
+            A one-element list containing the document built from the file.
+        """
+        from bs4 import BeautifulSoup
+
+        with open(self.file_path, "r", encoding=self.open_encoding) as f:
+            soup = BeautifulSoup(f, **self.bs_kwargs)
+
+        text = soup.get_text()
+
+        # ``.string`` is None when <title> is empty or has several children;
+        # str(None) would store the literal text "None" as the title.
+        if soup.title is not None and soup.title.string is not None:
+            title = str(soup.title.string)
+        else:
+            title = ""
+
+        metadata: Dict[str, Union[str, None]] = {
+            "source": self.file_path,
+            "title": title,
+        }
+        return [Document(page_content=text, metadata=metadata)]
diff --git a/tests/integration_tests/document_loaders/test_bshtml.py b/tests/integration_tests/document_loaders/test_bshtml.py
new file mode 100644
index 0000000000..7e67d6211b
--- /dev/null
+++ b/tests/integration_tests/document_loaders/test_bshtml.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+from langchain.document_loaders.html_bs import BSHTMLLoader
+
+
+def test_bs_html_loader() -> None:
+    """Test that BSHTMLLoader returns one document with title and source."""
+    file_path = Path(__file__).parent.parent / "examples/example.html"
+    loader = BSHTMLLoader(str(file_path))
+    docs = loader.load()
+
+    assert len(docs) == 1
+
+    metadata = docs[0].metadata
+
+    assert metadata["title"] == "Chew dad's slippers"
+    assert metadata["source"] == str(file_path)
diff --git a/tests/integration_tests/examples/example.html b/tests/integration_tests/examples/example.html
new file mode 100644
index 0000000000..b9318b7a55
--- /dev/null
+++ b/tests/integration_tests/examples/example.html
@@ -0,0 +1,25 @@
+
+
+ Chew dad's slippers
+
+
+

+ Instead of drinking water from the cat bowl, make sure to steal water from + the toilet +

+

Chase the red dot

+

+ Munch, munch, chomp, chomp hate dogs. Spill litter box, scratch at owner, + destroy all furniture, especially couch get scared by sudden appearance of + cucumber cat is love, cat is life fat baby cat best buddy little guy for + catch eat throw up catch eat throw up bad birds jump on fridge. Purr like + a car engine oh yes, there is my human woman she does best pats ever that + all i like about her hiss meow . +

+

+ Dead stare with ears cocked when owners are asleep, cry for no apparent + reason meow all night. Plop down in the middle where everybody walks favor + packaging over toy. Sit on the laptop kitty pounce, trip, faceplant. +

+ +