Add HTML document_loader that includes page title metadata (#1720)

This `BSHTMLLoader` document loader loads an HTML document, extracts its
text, and adds the page title to the returned Document's metadata. The
loader uses the already-installed bs4 package to extract both the text
content and the page title.

Included in this PR is an example HTML file and an integration test that
tests against this file.

---------

Co-authored-by: Daniel Chalef <daniel.chalef@private.org>
tool-patch
Daniel Chalef 1 year ago committed by GitHub
parent 40e9488055
commit b157e0c1c3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -17,6 +17,7 @@ from langchain.document_loaders.googledrive import GoogleDriveLoader
from langchain.document_loaders.gutenberg import GutenbergLoader
from langchain.document_loaders.hn import HNLoader
from langchain.document_loaders.html import UnstructuredHTMLLoader
from langchain.document_loaders.html_bs import BSHTMLLoader
from langchain.document_loaders.ifixit import IFixitLoader
from langchain.document_loaders.image import UnstructuredImageLoader
from langchain.document_loaders.imsdb import IMSDbLoader
@ -64,6 +65,7 @@ __all__ = [
"ReadTheDocsLoader",
"GoogleDriveLoader",
"UnstructuredHTMLLoader",
"BSHTMLLoader",
"UnstructuredPowerPointLoader",
"UnstructuredWordDocumentLoader",
"UnstructuredPDFLoader",

@ -5,10 +5,13 @@ from typing import List, Type, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.html_bs import BSHTMLLoader
from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
# Earlier two-member definition; the assignment that follows rebinds the same
# name, so only the later value is in effect.
FILE_LOADER_TYPE = Union[Type[UnstructuredFileLoader], Type[TextLoader]]
# Union of the loader classes covered by this alias, now including BSHTMLLoader.
FILE_LOADER_TYPE = Union[
Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader]
]
# Module-level logger, named after this file's path (``__file__``).
logger = logging.getLogger(__file__)

@ -0,0 +1,36 @@
"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""
import logging
from typing import Dict, List, Union
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
logger = logging.getLogger(__file__)
class BSHTMLLoader(BaseLoader):
    """Loader that uses beautiful soup to parse HTML files.

    The document text is taken from ``soup.get_text()``. The content of the
    ``<title>`` element is stored under the ``"title"`` metadata key,
    alongside the file path under ``"source"``.
    """

    def __init__(
        self, file_path: str, open_encoding: Union[str, None] = None
    ) -> None:
        """Initialise the loader.

        Args:
            file_path: Path of the HTML file to load.
            open_encoding: Encoding passed to ``open`` when reading the file.
                ``None`` (the default) keeps the platform default encoding,
                which is what the loader used before this option existed.
        """
        self.file_path = file_path
        self.open_encoding = open_encoding

    def load(self) -> List[Document]:
        """Load HTML document into document objects.

        Returns:
            A single-element list holding one ``Document`` whose
            ``page_content`` is the text of the page and whose metadata has
            the keys ``"source"`` and ``"title"``. ``"title"`` is an empty
            string when the page has no usable title.
        """
        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            soup = BeautifulSoup(f, features="lxml")

        text = soup.get_text()

        # ``Tag.string`` is None when <title> is empty or has more than one
        # child; calling str() on it would store the literal text "None".
        title_tag = soup.title
        title_string = title_tag.string if title_tag is not None else None
        title = str(title_string) if title_string is not None else ""

        metadata: Dict[str, Union[str, None]] = {
            "source": self.file_path,
            "title": title,
        }
        return [Document(page_content=text, metadata=metadata)]

@ -0,0 +1,17 @@
from pathlib import Path
from langchain.document_loaders.html_bs import BSHTMLLoader
def test_bs_html_loader() -> None:
    """Check that BSHTMLLoader yields one document with title and source metadata."""
    file_path = Path(__file__).parent.parent / "examples/example.html"
    expected_source = str(file_path)

    documents = BSHTMLLoader(expected_source).load()
    assert len(documents) == 1

    # The fixture's <title> and the path handed to the loader must both
    # appear in the metadata of the single returned document.
    meta = documents[0].metadata
    assert meta["title"] == "Chew dad's slippers"
    assert meta["source"] == expected_source

@ -0,0 +1,25 @@
<html>
<head>
<title>Chew dad's slippers</title>
</head>
<body>
<h1>
Instead of drinking water from the cat bowl, make sure to steal water from
the toilet
</h1>
<h2>Chase the red dot</h2>
<p>
Munch, munch, chomp, chomp hate dogs. Spill litter box, scratch at owner,
destroy all furniture, especially couch get scared by sudden appearance of
cucumber cat is love, cat is life fat baby cat best buddy little guy for
catch eat throw up catch eat throw up bad birds jump on fridge. Purr like
a car engine oh yes, there is my human woman she does best pats ever that
all i like about her hiss meow .
</p>
<p>
Dead stare with ears cocked when owners are asleep, cry for no apparent
reason meow all night. Plop down in the middle where everybody walks favor
packaging over toy. Sit on the laptop kitty pounce, trip, faceplant.
</p>
</body>
</html>
Loading…
Cancel
Save