diff --git a/docs/modules/indexes/document_loaders/examples/email.ipynb b/docs/modules/indexes/document_loaders/examples/email.ipynb index 1ad2c590..b9789fe6 100644 --- a/docs/modules/indexes/document_loaders/examples/email.ipynb +++ b/docs/modules/indexes/document_loaders/examples/email.ipynb @@ -7,7 +7,15 @@ "source": [ "# Email\n", "\n", - "This notebook shows how to load email (`.eml`) files." + "This notebook shows how to load email (`.eml`) and Microsoft Outlook (`.msg`) files." + ] + }, + { + "cell_type": "markdown", + "id": "89caa348", + "metadata": {}, + "source": [ + "## Using Unstructured" ] }, { @@ -66,7 +74,7 @@ "id": "8bf50cba", "metadata": {}, "source": [ - "## Retain Elements\n", + "### Retain Elements\n", "\n", "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." ] @@ -112,10 +120,69 @@ "data[0]" ] }, + { + "cell_type": "markdown", + "id": "6a074515", + "metadata": {}, + "source": [ + "## Using OutlookMessageLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1e7a8444", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import OutlookMessageLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "77a055e6", + "metadata": {}, + "outputs": [], + "source": [ + "loader = OutlookMessageLoader('example_data/fake-email.msg')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "789882de", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "46aa0632", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='This is a test email to experiment with the MS Outlook MSG Extractor\\r\\n\\r\\n\\r\\n-- \\r\\n\\r\\n\\r\\nKind regards\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nBrian Zhou\\r\\n\\r\\n', 
metadata={'subject': 'Test for TIF files', 'sender': 'Brian Zhou ', 'date': 'Mon, 18 Nov 2013 16:26:24 +0800'})" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "6a074515", + "id": "2b223ce2", "metadata": {}, "outputs": [], "source": [] diff --git a/docs/modules/indexes/document_loaders/examples/example_data/fake-email.msg b/docs/modules/indexes/document_loaders/examples/example_data/fake-email.msg new file mode 100644 index 00000000..0dac0e86 Binary files /dev/null and b/docs/modules/indexes/document_loaders/examples/example_data/fake-email.msg differ diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 31cdfed0..d2ca8d8e 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -11,13 +11,18 @@ from langchain.document_loaders.azure_blob_storage_file import ( ) from langchain.document_loaders.bigquery import BigQueryLoader from langchain.document_loaders.blackboard import BlackboardLoader -from langchain.document_loaders.college_confidential import CollegeConfidentialLoader +from langchain.document_loaders.college_confidential import ( + CollegeConfidentialLoader, +) from langchain.document_loaders.conllu import CoNLLULoader from langchain.document_loaders.csv_loader import CSVLoader from langchain.document_loaders.dataframe import DataFrameLoader from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.duckdb_loader import DuckDBLoader -from langchain.document_loaders.email import UnstructuredEmailLoader +from langchain.document_loaders.email import ( + OutlookMessageLoader, + UnstructuredEmailLoader, +) from langchain.document_loaders.epub import UnstructuredEPubLoader from langchain.document_loaders.evernote import EverNoteLoader from langchain.document_loaders.facebook_chat import FacebookChatLoader 
class OutlookMessageLoader(BaseLoader):
    """Loader that loads Outlook Message (``.msg``) files using extract_msg.

    See https://github.com/TeamMsgExtractor/msg-extractor
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the ``.msg`` file to load.

        Raises:
            ValueError: If ``file_path`` is not an existing regular file.
            ImportError: If the ``extract_msg`` package is not installed.
        """
        self.file_path = file_path

        if not os.path.isfile(self.file_path):
            raise ValueError(f"File path {self.file_path} is not a valid file")

        # Probe the optional dependency at construction time so a missing
        # package is reported here rather than on the first ``load()`` call.
        try:
            import extract_msg  # noqa:F401
        except ImportError as err:
            raise ImportError(
                "extract_msg is not installed. Please install it with "
                "`pip install extract_msg`"
            ) from err

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            A single-element list holding one ``Document`` whose
            ``page_content`` is the message body and whose metadata carries
            the message's ``subject``, ``sender`` and ``date``.
        """
        import extract_msg

        msg = extract_msg.Message(self.file_path)
        try:
            # Read every field while the message is still open; the Document
            # is fully built before the ``finally`` clause closes the file.
            return [
                Document(
                    page_content=msg.body,
                    metadata={
                        "subject": msg.subject,
                        "sender": msg.sender,
                        "date": msg.date,
                    },
                )
            ]
        finally:
            # Release the handle that extract_msg holds on the .msg file.
            msg.close()