From 87802c86d976c6817ee48cb2a25162d5da109e36 Mon Sep 17 00:00:00 2001
From: Pau Ramon Revilla <pau@factorial.co>
Date: Sun, 25 Jun 2023 22:12:08 +0200
Subject: [PATCH] Added a MHTML document loader (#6311)

MHTML is a very interesting format since it's used both for emails and
for archived webpages. Some scraping projects want to store pages on
disk to process them later; MHTML is perfect for that use case.

This is heavily inspired by the BeautifulSoup HTML loader, but
extracts the HTML part from the MHTML file.

---------

Co-authored-by: rlm <pexpresss31@gmail.com>
---
 .../document_loaders/integrations/mhtml.ipynb |  71 ++++++++++++
 langchain/document_loaders/__init__.py        |   2 +
 langchain/document_loaders/mhtml.py           |  69 +++++++++++
 tests/integration_tests/examples/example.mht  | 108 ++++++++++++++++++
 .../unit_tests/document_loaders/test_mhtml.py |  25 ++++
 5 files changed, 275 insertions(+)
 create mode 100644 docs/extras/modules/data_connection/document_loaders/integrations/mhtml.ipynb
 create mode 100644 langchain/document_loaders/mhtml.py
 create mode 100644 tests/integration_tests/examples/example.mht
 create mode 100644 tests/unit_tests/document_loaders/test_mhtml.py

diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/mhtml.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/mhtml.ipynb
new file mode 100644
index 0000000000..12ebd2a3e1
--- /dev/null
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/mhtml.ipynb
@@ -0,0 +1,71 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "87067cdf",
+   "metadata": {},
+   "source": [
+    "# mhtml\n",
+    "\n",
+    "MHTML is used both for emails and for archived webpages. MHTML, sometimes referred to as MHT, stands for MIME HTML and is a single file in which an entire webpage is archived. When one saves a webpage in MHTML format, the file will contain HTML code, images, audio files, flash animation, etc."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d4c6174",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import MHTMLLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "12dcebc8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "page_content='LangChain\\nLANG CHAIN 🦜️🔗Official Home Page\\xa0\\n\\n\\n\\n\\n\\n\\n\\nIntegrations\\n\\n\\n\\nFeatures\\n\\n\\n\\n\\nBlog\\n\\n\\n\\nConceptual Guide\\n\\n\\n\\n\\nPython Repo\\n\\n\\nJavaScript Repo\\n\\n\\n\\nPython Documentation \\n\\n\\nJavaScript Documentation\\n\\n\\n\\n\\nPython ChatLangChain \\n\\n\\nJavaScript ChatLangChain\\n\\n\\n\\n\\nDiscord \\n\\n\\nTwitter\\n\\n\\n\\n\\nIf you have any comments about our WEB page, you can \\nwrite us at the address shown above.  However, due to \\nthe limited number of personnel in our corporate office, we are unable to \\nprovide a direct response.\\n\\nCopyright © 2023-2023 LangChain Inc.\\n\\n\\n' metadata={'source': '../../../../../../tests/integration_tests/examples/example.mht', 'title': 'LangChain'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Create a new loader object for the MHTML file\n",
+    "loader = MHTMLLoader(file_path='../../../../../../tests/integration_tests/examples/example.mht')\n",
+    "\n",
+    "# Load the document from the file\n",
+    "documents = loader.load()\n",
+    "\n",
+    "# Print the documents to see the results\n",
+    "for doc in documents:\n",
+    "    print(doc)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index 1503734f77..57e450c884 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -68,6 +68,7 @@ from langchain.document_loaders.mastodon import MastodonTootsLoader
 from langchain.document_loaders.max_compute import MaxComputeLoader
 from langchain.document_loaders.mediawikidump import MWDumpLoader
 from langchain.document_loaders.merge import MergedDataLoader
+from langchain.document_loaders.mhtml import MHTMLLoader
 from langchain.document_loaders.modern_treasury import ModernTreasuryLoader
 from langchain.document_loaders.notebook import NotebookLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
@@ -205,6 +206,7 @@ __all__ = [
     "MathpixPDFLoader",
     "MaxComputeLoader",
     "MergedDataLoader",
+    "MHTMLLoader",
     "ModernTreasuryLoader",
     "NotebookLoader",
     "NotionDBLoader",
diff --git a/langchain/document_loaders/mhtml.py b/langchain/document_loaders/mhtml.py
new file mode 100644
index 0000000000..27d3eceb12
--- /dev/null
+++ b/langchain/document_loaders/mhtml.py
@@ -0,0 +1,69 @@
+"""Loader to load MHTML files, enriching metadata with page title."""
+
+import email
+import logging
+from typing import Dict, List, Union
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+logger = logging.getLogger(__name__)
+
+
+class MHTMLLoader(BaseLoader):
+    """Loader that uses beautiful soup to parse the HTML part of MHTML files."""
+
+    def __init__(
+        self,
+        file_path: str,
+        open_encoding: Union[str, None] = None,
+        bs_kwargs: Union[dict, None] = None,
+        get_text_separator: str = "",
+    ) -> None:
+        """Initialise the loader.
+
+        Args:
+            file_path: Path of the MHTML file to read.
+            open_encoding: Encoding passed to ``open``; None uses its default.
+            bs_kwargs: Kwargs for BeautifulSoup; None selects the lxml parser.
+            get_text_separator: Separator passed to ``soup.get_text``.
+
+        Raises:
+            ValueError: If the ``bs4`` package cannot be imported.
+        """
+        try:
+            import bs4  # noqa:F401
+        except ImportError as e:
+            raise ValueError(
+                "beautifulsoup4 package not found, please install it with "
+                "`pip install beautifulsoup4`"
+            ) from e
+
+        self.file_path = file_path
+        self.open_encoding = open_encoding
+        self.bs_kwargs = {"features": "lxml"} if bs_kwargs is None else bs_kwargs
+        self.get_text_separator = get_text_separator
+
+    def load(self) -> List[Document]:
+        """Load MHTML document into document objects.
+
+        Returns a one-item list for the first ``text/html`` part, else an empty list.
+        """
+        from bs4 import BeautifulSoup
+
+        with open(self.file_path, "r", encoding=self.open_encoding) as f:
+            message = email.message_from_string(f.read())
+
+        # walk() yields the message itself, then every nested part in order.
+        for part in message.walk():
+            if part.get_content_type() == "text/html":
+                html = part.get_payload(decode=True).decode()
+                soup = BeautifulSoup(html, **self.bs_kwargs)
+                text = soup.get_text(self.get_text_separator)
+                title = str(soup.title.string) if soup.title else ""
+                metadata: Dict[str, Union[str, None]] = {
+                    "source": self.file_path,
+                    "title": title,
+                }
+                return [Document(page_content=text, metadata=metadata)]
+        return []
diff --git a/tests/integration_tests/examples/example.mht b/tests/integration_tests/examples/example.mht
new file mode 100644
index 0000000000..44a45ea020
--- /dev/null
+++ b/tests/integration_tests/examples/example.mht
@@ -0,0 +1,108 @@
+From: <Saved by Blink>
+Snapshot-Content-Location: https://langchain.com/
+Subject: 
+Date: Fri, 16 Jun 2023 19:32:59 -0000
+MIME-Version: 1.0
+Content-Type: multipart/related;
+	type="text/html";
+	boundary="----MultipartBoundary--dYaUgeoeP18TqraaeOwkeZyu1vI09OtkFwH2rcnJMt----"
+
+
+------MultipartBoundary--dYaUgeoeP18TqraaeOwkeZyu1vI09OtkFwH2rcnJMt----
+Content-Type: text/html
+Content-ID: <frame-2F1DB31BBD26C55A7F1EEC7561350515@mhtml.blink>
+Content-Transfer-Encoding: quoted-printable
+Content-Location: https://langchain.com/
+
+<html><head><title>LangChain</title><meta http-equiv=3D"Content-Type" content=3D"text/html; charset=
+=3DUTF-8"><link rel=3D"stylesheet" type=3D"text/css" href=3D"cid:css-c9ac93=
+be-2ab2-46d8-8690-80da3a6d1832@mhtml.blink" /></head><body data-new-gr-c-s-=
+check-loaded=3D"14.1112.0" data-gr-ext-installed=3D""><p align=3D"center">
+	<b><font size=3D"6">L</font><font size=3D"4">ANG </font><font size=3D"6">C=
+</font><font size=3D"4">HAIN </font><font size=3D"2">=F0=9F=A6=9C=EF=B8=8F=
+=F0=9F=94=97</font><br>Official Home Page</b><font size=3D"1">&nbsp;</font>=
+</p>
+
+<hr>
+<center>
+<table border=3D"0" cellspacing=3D"0" width=3D"90%">
+  <tbody>
+  <tr>
+    <td height=3D"55" valign=3D"top" width=3D"50%">
+      <ul>
+        <li><a href=3D"https://langchain.com/integrations.html">Integration=
+s</a>=20
+    </li></ul></td>
+   <td height=3D"45" valign=3D"top" width=3D"50%">
+      <ul>
+        <li><a href=3D"https://langchain.com/features.html">Features</a>=20
+        </li></ul></td></tr>
+    <tr>
+    <td height=3D"55" valign=3D"top" width=3D"50%">
+      <ul>
+        <li><a href=3D"https://blog.langchain.dev/">Blog</a>=20
+    </li></ul></td>
+   <td height=3D"45" valign=3D"top" width=3D"50%">
+      <ul>
+        <li><a href=3D"https://docs.langchain.com/docs/">Conceptual Guide</=
+a>=20
+        </li></ul></td></tr>
+
+  <tr>
+    <td height=3D"45" valign=3D"top" width=3D"50%">
+      <ul>
+        <li><a href=3D"https://github.com/hwchase17/langchain">Python Repo<=
+/a></li></ul></td>
+    <td height=3D"45" valign=3D"top" width=3D"50%">
+		  <ul>
+        <li><a href=3D"https://github.com/hwchase17/langchainjs">JavaScript=
+ Repo</a></li></ul></td></tr>
+ =20
+=09
+  <tr>
+    <td height=3D"45" valign=3D"top" width=3D"50%">
+      <ul>
+        <li><a href=3D"https://python.langchain.com/en/latest/">Python Docu=
+mentation</a> </li></ul></td>
+    <td height=3D"45" valign=3D"top" width=3D"50%">
+      <ul>
+         <li><a href=3D"https://js.langchain.com/docs/">JavaScript Document=
+ation</a>
+					</li></ul></td></tr>
+  <tr>
+    <td height=3D"45" valign=3D"top" width=3D"50%">
+      <ul>
+        <li><a href=3D"https://github.com/hwchase17/chat-langchain">Python =
+ChatLangChain</a> </li></ul></td>
+    <td height=3D"45" valign=3D"top" width=3D"50%">
+      <ul>
+         <li><a href=3D"https://github.com/sullivan-sean/chat-langchainjs">=
+JavaScript ChatLangChain</a>
+					</li></ul></td></tr>
+  <tr>
+    <td height=3D"45" valign=3D"top" width=3D"50%">
+      <ul>
+        <li><a href=3D"https://discord.gg/6adMQxSpJS">Discord</a> </li></ul=
+></td>
+    <td height=3D"55" valign=3D"top" width=3D"50%">
+      <ul>
+        <li><a href=3D"https://twitter.com/langchainai">Twitter</a>
+					</li></ul></td></tr>
+			=09
+
+
+</tbody></table></center>
+<hr>
+<font size=3D"2">
+<p>If you have any comments about our WEB page, you can=20
+write us at the address shown above.  However, due to=20
+the limited number of personnel in our corporate office, we are unable to=
+=20
+provide a direct response.</p></font>
+<hr>
+<p align=3D"left"><font size=3D"2">Copyright =C2=A9 2023-2023<b> LangChain =
+Inc.</b></font><font size=3D"2">=20
+</font></p>
+</body></html>
+
+------MultipartBoundary--dYaUgeoeP18TqraaeOwkeZyu1vI09OtkFwH2rcnJMt------
diff --git a/tests/unit_tests/document_loaders/test_mhtml.py b/tests/unit_tests/document_loaders/test_mhtml.py
new file mode 100644
index 0000000000..2ab36defbb
--- /dev/null
+++ b/tests/unit_tests/document_loaders/test_mhtml.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+import pytest
+
+from langchain.document_loaders.mhtml import MHTMLLoader
+
+HERE = Path(__file__).parent
+EXAMPLES = HERE.parent.parent / "integration_tests" / "examples"
+
+
+@pytest.mark.requires("bs4", "lxml")
+def test_mhtml_loader() -> None:
+    """Load the example MHTML archive and verify its title, source and text."""
+    source = str(EXAMPLES / "example.mht")
+
+    # The sample archive has one HTML part, so exactly one document is expected.
+    loaded = MHTMLLoader(source).load()
+    assert len(loaded) == 1
+
+    (document,) = loaded
+    # Title comes from the page's <title>; source echoes the path we passed in.
+    assert document.metadata["title"] == "LangChain"
+    assert document.metadata["source"] == source
+    # Body text keeps the decoded emoji from the quoted-printable payload.
+    assert "LANG CHAIN 🦜️🔗Official Home Page" in document.page_content