Harrison/msg files (#2375)

Co-authored-by: Sahil Masand <masand.sahil@gmail.com>
Co-authored-by: Sahil Masand <masands@cbh.com.au>
pull/2391/head
Harrison Chase 1 year ago committed by GitHub
parent 585f60a5aa
commit e90d007db3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -7,7 +7,15 @@
"source": [ "source": [
"# Email\n", "# Email\n",
"\n", "\n",
"This notebook shows how to load email (`.eml`) files." "This notebook shows how to load email (`.eml`) and Microsoft Outlook (`.msg`) files."
]
},
{
"cell_type": "markdown",
"id": "89caa348",
"metadata": {},
"source": [
"## Using Unstructured"
] ]
}, },
{ {
@ -66,7 +74,7 @@
"id": "8bf50cba", "id": "8bf50cba",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Retain Elements\n", "### Retain Elements\n",
"\n", "\n",
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`." "Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
] ]
@ -112,10 +120,69 @@
"data[0]" "data[0]"
] ]
}, },
{
"cell_type": "markdown",
"id": "6a074515",
"metadata": {},
"source": [
"## Using OutlookMessageLoader"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1e7a8444",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import OutlookMessageLoader"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "77a055e6",
"metadata": {},
"outputs": [],
"source": [
"loader = OutlookMessageLoader('example_data/fake-email.msg')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "789882de",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "46aa0632",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='This is a test email to experiment with the MS Outlook MSG Extractor\\r\\n\\r\\n\\r\\n-- \\r\\n\\r\\n\\r\\nKind regards\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nBrian Zhou\\r\\n\\r\\n', metadata={'subject': 'Test for TIF files', 'sender': 'Brian Zhou <brizhou@gmail.com>', 'date': 'Mon, 18 Nov 2013 16:26:24 +0800'})"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[0]"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "6a074515", "id": "2b223ce2",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []

@ -11,13 +11,18 @@ from langchain.document_loaders.azure_blob_storage_file import (
) )
from langchain.document_loaders.bigquery import BigQueryLoader from langchain.document_loaders.bigquery import BigQueryLoader
from langchain.document_loaders.blackboard import BlackboardLoader from langchain.document_loaders.blackboard import BlackboardLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader from langchain.document_loaders.college_confidential import (
CollegeConfidentialLoader,
)
from langchain.document_loaders.conllu import CoNLLULoader from langchain.document_loaders.conllu import CoNLLULoader
from langchain.document_loaders.csv_loader import CSVLoader from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders.dataframe import DataFrameLoader from langchain.document_loaders.dataframe import DataFrameLoader
from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.duckdb_loader import DuckDBLoader from langchain.document_loaders.duckdb_loader import DuckDBLoader
from langchain.document_loaders.email import UnstructuredEmailLoader from langchain.document_loaders.email import (
OutlookMessageLoader,
UnstructuredEmailLoader,
)
from langchain.document_loaders.epub import UnstructuredEPubLoader from langchain.document_loaders.epub import UnstructuredEPubLoader
from langchain.document_loaders.evernote import EverNoteLoader from langchain.document_loaders.evernote import EverNoteLoader
from langchain.document_loaders.facebook_chat import FacebookChatLoader from langchain.document_loaders.facebook_chat import FacebookChatLoader
@ -61,7 +66,9 @@ from langchain.document_loaders.url import UnstructuredURLLoader
from langchain.document_loaders.url_selenium import SeleniumURLLoader from langchain.document_loaders.url_selenium import SeleniumURLLoader
from langchain.document_loaders.web_base import WebBaseLoader from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader from langchain.document_loaders.word_document import (
UnstructuredWordDocumentLoader,
)
from langchain.document_loaders.youtube import ( from langchain.document_loaders.youtube import (
GoogleApiClient, GoogleApiClient,
GoogleApiYoutubeLoader, GoogleApiYoutubeLoader,
@ -89,6 +96,7 @@ __all__ = [
"UnstructuredImageLoader", "UnstructuredImageLoader",
"ObsidianLoader", "ObsidianLoader",
"UnstructuredEmailLoader", "UnstructuredEmailLoader",
"OutlookMessageLoader",
"UnstructuredEPubLoader", "UnstructuredEPubLoader",
"UnstructuredMarkdownLoader", "UnstructuredMarkdownLoader",
"RoamLoader", "RoamLoader",

@ -1,6 +1,9 @@
"""Loader that loads email files.""" """Loader that loads email files."""
import os
from typing import List from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader
@ -11,3 +14,42 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
from unstructured.partition.email import partition_email from unstructured.partition.email import partition_email
return partition_email(filename=self.file_path) return partition_email(filename=self.file_path)
class OutlookMessageLoader(BaseLoader):
    """Loader that loads Outlook Message (``.msg``) files using extract_msg.

    https://github.com/TeamMsgExtractor/msg-extractor
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path of the Outlook message file to load.

        Raises:
            ValueError: If ``file_path`` does not point to an existing file.
            ImportError: If the ``extract_msg`` package is not installed.
        """
        self.file_path = file_path

        if not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file" % self.file_path)

        # Probe for the optional dependency at construction time so a missing
        # package is reported before load() is ever called.
        try:
            import extract_msg  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "extract_msg is not installed. Please install it with "
                "`pip install extract_msg`"
            ) from e

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            A single-element list holding one ``Document`` whose
            ``page_content`` is the message body and whose metadata carries
            the ``subject``, ``sender`` and ``date`` of the message.
        """
        import extract_msg

        msg = extract_msg.Message(self.file_path)
        try:
            return [
                Document(
                    page_content=msg.body,
                    metadata={
                        "subject": msg.subject,
                        "sender": msg.sender,
                        "date": msg.date,
                    },
                )
            ]
        finally:
            # The Message object keeps the underlying file open; release it
            # even if reading a field or building the Document fails.
            msg.close()

@ -0,0 +1,20 @@
from pathlib import Path
from langchain.document_loaders import OutlookMessageLoader
def test_outlook_message_loader() -> None:
    """Check that the sample ``.msg`` file yields one fully populated document."""
    msg_path = Path(__file__).parent.parent / "examples" / "hello.msg"
    expected_body = (
        "This is a test email to experiment with the MS Outlook MSG Extractor"
        "\r\n\r\n\r\n-- \r\n\r\n\r\n"
        "Kind regards\r\n\r\n\r\n\r\n\r\n"
        "Brian Zhou\r\n\r\n"
    )

    documents = OutlookMessageLoader(str(msg_path)).load()

    assert len(documents) == 1
    document = documents[0]
    assert document.metadata["subject"] == "Test for TIF files"
    assert document.metadata["sender"] == "Brian Zhou <brizhou@gmail.com>"
    assert document.metadata["date"] == "Mon, 18 Nov 2013 16:26:24 +0800"
    assert document.page_content == expected_body
Loading…
Cancel
Save