"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""

import logging
from typing import Any, Dict, Iterator, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class BS4HTMLParser(BaseBlobParser):
    """Parse HTML files using `Beautiful Soup`."""

    def __init__(
        self,
        *,
        features: str = "lxml",
        get_text_separator: str = "",
        **kwargs: Any,
    ) -> None:
        """Initialize a bs4-based HTML parser.

        Args:
            features: Parser backend passed to BeautifulSoup
                (e.g. "lxml" or "html.parser").
            get_text_separator: Separator inserted between text fragments
                when extracting the page text.
            **kwargs: Additional keyword arguments forwarded to BeautifulSoup.
        """
        try:
            import bs4  # noqa:F401
        except ImportError:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            )

        self.bs_kwargs = {"features": features, **kwargs}
        self.get_text_separator = get_text_separator

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load HTML document into document objects."""
        from bs4 import BeautifulSoup

        with blob.as_bytes_io() as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        # Use the <title> tag, if present, to enrich the document metadata.
        if soup.title:
            title = str(soup.title.string)
        else:
            title = ""

        metadata: Dict[str, Union[str, None]] = {
            "source": blob.source,
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)
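

# --- Usage sketch (not part of the upstream module) --------------------------
# A minimal, hedged example of how this parser might be exercised directly.
# It assumes beautifulsoup4 is installed and uses the stdlib "html.parser"
# backend so lxml is not required; Blob.from_data is assumed to accept a raw
# HTML string plus an optional path, which becomes the document's source.
if __name__ == "__main__":
    html = (
        "<html><head><title>Example Page</title></head>"
        "<body><p>Hello</p><p>World</p></body></html>"
    )
    parser = BS4HTMLParser(features="html.parser", get_text_separator=" ")
    blob = Blob.from_data(html, path="example.html")
    for doc in parser.lazy_parse(blob):
        # Metadata carries the blob's source and the <title> text.
        print(doc.metadata)      # {'source': 'example.html', 'title': 'Example Page'}
        # page_content is the whole page text (title included), joined by the separator.
        print(doc.page_content)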