Added a MHTML document loader (#6311)

MHTML is a very interesting format since it's used both for emails and
for archived webpages. Some scraping projects want to store pages
on disk to process them later; MHTML is perfect for that use case.

This is heavily inspired by the BeautifulSoup HTML loader, but it
extracts the HTML part from the MHTML file.

---------

Co-authored-by: rlm <pexpresss31@gmail.com>
This commit is contained in:
Pau Ramon Revilla 2023-06-25 22:12:08 +02:00 committed by GitHub
parent 05eec99269
commit 87802c86d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 275 additions and 0 deletions

View File

@ -0,0 +1,71 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "87067cdf",
"metadata": {},
"source": [
"# mhtml\n",
"\n",
"MHTML is used both for emails and for archived webpages. MHTML, sometimes referred to as MHT, stands for MIME HTML; it is a single file in which an entire webpage is archived. When one saves a webpage in MHTML format, the resulting file will contain HTML code, images, audio files, flash animation, etc."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d4c6174",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import MHTMLLoader"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "12dcebc8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"page_content='LangChain\\nLANG CHAIN 🦜🔗Official Home Page\\xa0\\n\\n\\n\\n\\n\\n\\n\\nIntegrations\\n\\n\\n\\nFeatures\\n\\n\\n\\n\\nBlog\\n\\n\\n\\nConceptual Guide\\n\\n\\n\\n\\nPython Repo\\n\\n\\nJavaScript Repo\\n\\n\\n\\nPython Documentation \\n\\n\\nJavaScript Documentation\\n\\n\\n\\n\\nPython ChatLangChain \\n\\n\\nJavaScript ChatLangChain\\n\\n\\n\\n\\nDiscord \\n\\n\\nTwitter\\n\\n\\n\\n\\nIf you have any comments about our WEB page, you can \\nwrite us at the address shown above. However, due to \\nthe limited number of personnel in our corporate office, we are unable to \\nprovide a direct response.\\n\\nCopyright © 2023-2023 LangChain Inc.\\n\\n\\n' metadata={'source': '../../../../../../tests/integration_tests/examples/example.mht', 'title': 'LangChain'}\n"
]
}
],
"source": [
"# Create a new loader object for the MHTML file\n",
"loader = MHTMLLoader(file_path='../../../../../../tests/integration_tests/examples/example.mht')\n",
"\n",
"# Load the document from the file\n",
"documents = loader.load()\n",
"\n",
"# Print the documents to see the results\n",
"for doc in documents:\n",
" print(doc)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -68,6 +68,7 @@ from langchain.document_loaders.mastodon import MastodonTootsLoader
from langchain.document_loaders.max_compute import MaxComputeLoader
from langchain.document_loaders.mediawikidump import MWDumpLoader
from langchain.document_loaders.merge import MergedDataLoader
from langchain.document_loaders.mhtml import MHTMLLoader
from langchain.document_loaders.modern_treasury import ModernTreasuryLoader
from langchain.document_loaders.notebook import NotebookLoader
from langchain.document_loaders.notion import NotionDirectoryLoader
@ -205,6 +206,7 @@ __all__ = [
"MathpixPDFLoader",
"MaxComputeLoader",
"MergedDataLoader",
"MHTMLLoader",
"ModernTreasuryLoader",
"NotebookLoader",
"NotionDBLoader",

View File

@ -0,0 +1,69 @@
"""Loader to load MHTML files, enriching metadata with page title."""
import email
import logging
from typing import Dict, List, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class MHTMLLoader(BaseLoader):
    """Loader that extracts the HTML part of an MHTML file and parses it with
    BeautifulSoup.

    The file is read as a MIME message; the first ``text/html`` part found is
    converted to plain text and returned as a single ``Document`` whose
    metadata holds the source path and the page title.
    """

    def __init__(
        self,
        file_path: str,
        open_encoding: Union[str, None] = None,
        bs_kwargs: Union[dict, None] = None,
        get_text_separator: str = "",
    ) -> None:
        """Initialise with path, and optionally, file encoding to use, and any kwargs
        to pass to the BeautifulSoup object.

        Args:
            file_path: Path of the MHTML file to load.
            open_encoding: Encoding passed to ``open`` when reading the file;
                ``None`` uses the platform default.
            bs_kwargs: Keyword arguments forwarded to ``BeautifulSoup``.
                Defaults to ``{"features": "lxml"}``.
            get_text_separator: Separator passed to ``soup.get_text``.

        Raises:
            ValueError: If the ``beautifulsoup4`` package is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            # ValueError is kept (rather than ImportError) so existing callers
            # that catch it keep working; the original error is chained.
            raise ValueError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from e

        self.file_path = file_path
        self.open_encoding = open_encoding
        if bs_kwargs is None:
            bs_kwargs = {"features": "lxml"}
        self.bs_kwargs = bs_kwargs
        self.get_text_separator = get_text_separator

    def load(self) -> List[Document]:
        """Load MHTML document into document objects.

        Returns:
            A one-element list holding the text of the first ``text/html``
            part, or an empty list when the file has no such part.
        """
        from bs4 import BeautifulSoup

        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            message = email.message_from_string(f.read())

        # walk() yields the message itself when it is not multipart, and
        # otherwise every sub-part depth-first, so HTML nested inside a
        # multipart container is found as well.
        for part in message.walk():
            if part.get_content_type() != "text/html":
                continue

            payload = part.get_payload(decode=True)
            charset = part.get_content_charset()
            try:
                # Honour the charset declared on the part when there is one.
                html = payload.decode(charset) if charset else payload.decode()
            except (LookupError, UnicodeDecodeError):
                # Unknown or incorrect declared charset: fall back to UTF-8.
                html = payload.decode()

            soup = BeautifulSoup(html, **self.bs_kwargs)
            text = soup.get_text(self.get_text_separator)
            title = str(soup.title.string) if soup.title else ""

            metadata: Dict[str, Union[str, None]] = {
                "source": self.file_path,
                "title": title,
            }
            return [Document(page_content=text, metadata=metadata)]

        return []

View File

@ -0,0 +1,108 @@
From: <Saved by Blink>
Snapshot-Content-Location: https://langchain.com/
Subject:
Date: Fri, 16 Jun 2023 19:32:59 -0000
MIME-Version: 1.0
Content-Type: multipart/related;
type="text/html";
boundary="----MultipartBoundary--dYaUgeoeP18TqraaeOwkeZyu1vI09OtkFwH2rcnJMt----"
------MultipartBoundary--dYaUgeoeP18TqraaeOwkeZyu1vI09OtkFwH2rcnJMt----
Content-Type: text/html
Content-ID: <frame-2F1DB31BBD26C55A7F1EEC7561350515@mhtml.blink>
Content-Transfer-Encoding: quoted-printable
Content-Location: https://langchain.com/
<html><head><title>LangChain</title><meta http-equiv=3D"Content-Type" content=3D"text/html; charset=
=3DUTF-8"><link rel=3D"stylesheet" type=3D"text/css" href=3D"cid:css-c9ac93=
be-2ab2-46d8-8690-80da3a6d1832@mhtml.blink" /></head><body data-new-gr-c-s-=
check-loaded=3D"14.1112.0" data-gr-ext-installed=3D""><p align=3D"center">
<b><font size=3D"6">L</font><font size=3D"4">ANG </font><font size=3D"6">C=
</font><font size=3D"4">HAIN </font><font size=3D"2">=F0=9F=A6=9C=EF=B8=8F=
=F0=9F=94=97</font><br>Official Home Page</b><font size=3D"1">&nbsp;</font>=
</p>
<hr>
<center>
<table border=3D"0" cellspacing=3D"0" width=3D"90%">
<tbody>
<tr>
<td height=3D"55" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://langchain.com/integrations.html">Integration=
s</a>=20
</li></ul></td>
<td height=3D"45" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://langchain.com/features.html">Features</a>=20
</li></ul></td></tr>
<tr>
<td height=3D"55" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://blog.langchain.dev/">Blog</a>=20
</li></ul></td>
<td height=3D"45" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://docs.langchain.com/docs/">Conceptual Guide</=
a>=20
</li></ul></td></tr>
<tr>
<td height=3D"45" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://github.com/hwchase17/langchain">Python Repo<=
/a></li></ul></td>
<td height=3D"45" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://github.com/hwchase17/langchainjs">JavaScript=
Repo</a></li></ul></td></tr>
=20
=09
<tr>
<td height=3D"45" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://python.langchain.com/en/latest/">Python Docu=
mentation</a> </li></ul></td>
<td height=3D"45" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://js.langchain.com/docs/">JavaScript Document=
ation</a>
</li></ul></td></tr>
<tr>
<td height=3D"45" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://github.com/hwchase17/chat-langchain">Python =
ChatLangChain</a> </li></ul></td>
<td height=3D"45" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://github.com/sullivan-sean/chat-langchainjs">=
JavaScript ChatLangChain</a>
</li></ul></td></tr>
<tr>
<td height=3D"45" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://discord.gg/6adMQxSpJS">Discord</a> </li></ul=
></td>
<td height=3D"55" valign=3D"top" width=3D"50%">
<ul>
<li><a href=3D"https://twitter.com/langchainai">Twitter</a>
</li></ul></td></tr>
=09
</tbody></table></center>
<hr>
<font size=3D"2">
<p>If you have any comments about our WEB page, you can=20
write us at the address shown above. However, due to=20
the limited number of personnel in our corporate office, we are unable to=
=20
provide a direct response.</p></font>
<hr>
<p align=3D"left"><font size=3D"2">Copyright =C2=A9 2023-2023<b> LangChain =
Inc.</b></font><font size=3D"2">=20
</font></p>
</body></html>
------MultipartBoundary--dYaUgeoeP18TqraaeOwkeZyu1vI09OtkFwH2rcnJMt------

View File

@ -0,0 +1,25 @@
from pathlib import Path
import pytest
from langchain.document_loaders.mhtml import MHTMLLoader
# Directory that contains this test module.
HERE = Path(__file__).parent
# Example fixtures: two directory levels above HERE, under
# integration_tests/examples.
EXAMPLES = HERE.parent.parent / "integration_tests" / "examples"
@pytest.mark.requires("bs4", "lxml")
def test_mhtml_loader() -> None:
    """Load the example MHTML file and check its title, source and text."""
    example_path = str(EXAMPLES / "example.mht")

    documents = MHTMLLoader(example_path).load()

    # The example archive holds a single HTML page.
    assert len(documents) == 1
    document = documents[0]
    assert document.metadata["title"] == "LangChain"
    assert document.metadata["source"] == example_path
    assert "LANG CHAIN 🦜🔗Official Home Page" in document.page_content