Harrison/msg files (#2375)

Co-authored-by: Sahil Masand <masand.sahil@gmail.com>
Co-authored-by: Sahil Masand <masands@cbh.com.au>
doc
Harrison Chase 1 year ago committed by GitHub
parent 585f60a5aa
commit e90d007db3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -7,7 +7,15 @@
"source": [
"# Email\n",
"\n",
"This notebook shows how to load email (`.eml`) files."
"This notebook shows how to load email (`.eml`) and Microsoft Outlook (`.msg`) files."
]
},
{
"cell_type": "markdown",
"id": "89caa348",
"metadata": {},
"source": [
"## Using Unstructured"
]
},
{
@ -66,7 +74,7 @@
"id": "8bf50cba",
"metadata": {},
"source": [
"## Retain Elements\n",
"### Retain Elements\n",
"\n",
"Under the hood, Unstructured creates different \"elements\" for different chunks of text. By default we combine those together, but you can easily keep that separation by specifying `mode=\"elements\"`."
]
@ -112,10 +120,69 @@
"data[0]"
]
},
{
"cell_type": "markdown",
"id": "6a074515",
"metadata": {},
"source": [
"## Using OutlookMessageLoader"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1e7a8444",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import OutlookMessageLoader"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "77a055e6",
"metadata": {},
"outputs": [],
"source": [
"loader = OutlookMessageLoader('example_data/fake-email.msg')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "789882de",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "46aa0632",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='This is a test email to experiment with the MS Outlook MSG Extractor\\r\\n\\r\\n\\r\\n-- \\r\\n\\r\\n\\r\\nKind regards\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nBrian Zhou\\r\\n\\r\\n', metadata={'subject': 'Test for TIF files', 'sender': 'Brian Zhou <brizhou@gmail.com>', 'date': 'Mon, 18 Nov 2013 16:26:24 +0800'})"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a074515",
"id": "2b223ce2",
"metadata": {},
"outputs": [],
"source": []

@ -11,13 +11,18 @@ from langchain.document_loaders.azure_blob_storage_file import (
)
from langchain.document_loaders.bigquery import BigQueryLoader
from langchain.document_loaders.blackboard import BlackboardLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
from langchain.document_loaders.college_confidential import (
CollegeConfidentialLoader,
)
from langchain.document_loaders.conllu import CoNLLULoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders.dataframe import DataFrameLoader
from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.duckdb_loader import DuckDBLoader
from langchain.document_loaders.email import UnstructuredEmailLoader
from langchain.document_loaders.email import (
OutlookMessageLoader,
UnstructuredEmailLoader,
)
from langchain.document_loaders.epub import UnstructuredEPubLoader
from langchain.document_loaders.evernote import EverNoteLoader
from langchain.document_loaders.facebook_chat import FacebookChatLoader
@ -61,7 +66,9 @@ from langchain.document_loaders.url import UnstructuredURLLoader
from langchain.document_loaders.url_selenium import SeleniumURLLoader
from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
from langchain.document_loaders.word_document import (
UnstructuredWordDocumentLoader,
)
from langchain.document_loaders.youtube import (
GoogleApiClient,
GoogleApiYoutubeLoader,
@ -89,6 +96,7 @@ __all__ = [
"UnstructuredImageLoader",
"ObsidianLoader",
"UnstructuredEmailLoader",
"OutlookMessageLoader",
"UnstructuredEPubLoader",
"UnstructuredMarkdownLoader",
"RoamLoader",

@ -1,6 +1,9 @@
"""Loader that loads email files."""
import os
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
@ -11,3 +14,42 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
from unstructured.partition.email import partition_email
return partition_email(filename=self.file_path)
class OutlookMessageLoader(BaseLoader):
    """Loader that loads Outlook Message (``.msg``) files using extract_msg.

    https://github.com/TeamMsgExtractor/msg-extractor
    """

    def __init__(self, file_path: str):
        """Initialize with file path.

        Args:
            file_path: Path to the Outlook message file to load.

        Raises:
            ValueError: If ``file_path`` does not point to an existing file.
            ImportError: If the ``extract_msg`` package cannot be imported.
        """
        self.file_path = file_path

        if not os.path.isfile(self.file_path):
            raise ValueError("File path %s is not a valid file" % self.file_path)

        # Import eagerly (and discard it) so a missing optional dependency is
        # reported at construction time rather than on the first load() call.
        try:
            import extract_msg  # noqa:F401
        except ImportError:
            raise ImportError(
                "extract_msg is not installed. Please install it with "
                "`pip install extract_msg`"
            )

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            A single-element list holding one Document whose page content is
            the message body and whose metadata carries the message's
            ``subject``, ``sender`` and ``date``.
        """
        import extract_msg

        msg = extract_msg.Message(self.file_path)
        try:
            # Every field is read while the message is still open; the
            # finally clause then releases the underlying file handle, which
            # was previously left open after each call.
            return [
                Document(
                    page_content=msg.body,
                    metadata={
                        "subject": msg.subject,
                        "sender": msg.sender,
                        "date": msg.date,
                    },
                )
            ]
        finally:
            msg.close()

@ -0,0 +1,20 @@
from pathlib import Path
from langchain.document_loaders import OutlookMessageLoader
def test_outlook_message_loader() -> None:
    """Load the sample ``.msg`` file and verify its metadata and body text."""
    msg_path = Path(__file__).parent.parent / "examples/hello.msg"
    docs = OutlookMessageLoader(str(msg_path)).load()

    assert len(docs) == 1
    document = docs[0]

    # Checked key by key, in the same order as before, so additional
    # metadata entries on the document do not affect the outcome.
    expected_metadata = {
        "subject": "Test for TIF files",
        "sender": "Brian Zhou <brizhou@gmail.com>",
        "date": "Mon, 18 Nov 2013 16:26:24 +0800",
    }
    for key, expected_value in expected_metadata.items():
        assert document.metadata[key] == expected_value

    expected_body = (
        "This is a test email to experiment with the MS Outlook MSG "
        "Extractor\r\n\r\n\r\n-- \r\n\r\n\r\nKind regards"
        "\r\n\r\n\r\n\r\n\r\nBrian Zhou\r\n\r\n"
    )
    assert document.page_content == expected_body
Loading…
Cancel
Save