langchain/libs/community/langchain_community/document_loaders/mhtml.py
mwmajewsk f7a1fd91b8
community: better support of pathlib paths in document loaders (#18396)
So this arose from the
https://github.com/langchain-ai/langchain/pull/18397 problem of document
loaders not supporting `pathlib.Path`.

This pull request provides more uniform support for Path as an argument.
The core ideas for this upgrade: 
- if there is a local file path used as an argument, it should be
supported as `pathlib.Path`
- if there are some external calls that may or may not support pathlib,
the argument is immediately converted to `str`
- if `self.file_path` is used in a way that allows it to
stay a `pathlib.Path` without conversion, it is only converted for the metadata.

Twitter handle: https://twitter.com/mwmajewsk
2024-03-26 11:51:52 -04:00

78 lines
2.5 KiB
Python

import email
import logging
from pathlib import Path
from typing import Dict, Iterator, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class MHTMLLoader(BaseLoader):
    """Parse `MHTML` files with `BeautifulSoup`.

    The file is read as an email-style MIME message; the first part whose
    content type is ``text/html`` is parsed with BeautifulSoup and emitted as
    a single ``Document`` carrying the extracted text plus ``source`` and
    ``title`` metadata.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        open_encoding: Union[str, None] = None,
        bs_kwargs: Union[dict, None] = None,
        get_text_separator: str = "",
    ) -> None:
        """Initialize with path, and optionally, file encoding to use, and any
        kwargs to pass to the BeautifulSoup object.

        Args:
            file_path: Path to file to load.
            open_encoding: The encoding to use when opening the file.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
                Defaults to ``{"features": "lxml"}`` when not given.
            get_text_separator: The separator to use when getting the text
                from the soup.

        Raises:
            ImportError: If the ``beautifulsoup4`` package is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            # Chain the original error so the root cause stays visible.
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from e

        self.file_path = file_path
        self.open_encoding = open_encoding
        if bs_kwargs is None:
            bs_kwargs = {"features": "lxml"}
        self.bs_kwargs = bs_kwargs
        self.get_text_separator = get_text_separator

    def lazy_load(self) -> Iterator[Document]:
        """Load MHTML document into document objects.

        Yields:
            One ``Document`` built from the first ``text/html`` part of the
            message. Its metadata holds ``source`` (the file path as a string)
            and ``title`` (the page title, or ``""`` when there is none).
        """
        from bs4 import BeautifulSoup

        # The whole file is read up front, so the handle can be closed before
        # parsing instead of being held open while the generator is suspended.
        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            message = email.message_from_string(f.read())

        parts = message.get_payload()
        # A non-multipart message has a non-list payload; treat the message
        # itself as the only part.
        if not isinstance(parts, list):
            parts = [message]

        for part in parts:
            if part.get_content_type() != "text/html":
                continue

            raw = part.get_payload(decode=True)
            # Honor the charset declared on the part; fall back to UTF-8 when
            # none is declared, the label is unknown, or decoding fails.
            charset = part.get_content_charset() or "utf-8"
            try:
                html = raw.decode(charset)
            except (LookupError, UnicodeDecodeError):
                html = raw.decode()

            soup = BeautifulSoup(html, **self.bs_kwargs)
            text = soup.get_text(self.get_text_separator)

            # ``.string`` is None for an empty <title> or one with nested
            # markup; use "" rather than the literal text "None".
            title_tag = soup.title
            if title_tag and title_tag.string is not None:
                title = str(title_tag.string)
            else:
                title = ""

            metadata: Dict[str, Union[str, None]] = {
                "source": str(self.file_path),
                "title": title,
            }
            yield Document(page_content=text, metadata=metadata)
            # Stop after the first HTML part has been emitted.
            return