From 1140bd79a0ba3ac7ad4a215f8455e9e93b0987a7 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 5 Apr 2023 18:28:14 -0400 Subject: [PATCH] feat: adds support for MSFT Outlook files in `UnstructuredEmailLoader` (#2450) ### Summary Adds support for Microsoft Outlook emails saved in `.msg` format to `UnstructuredEmailLoader`. The `.msg` path requires `unstructured>=0.5.8`; `.eml` files are still parsed with `partition_email`. ### Testing The following tests use the example files under `example-docs` in the Unstructured repo. ```python from langchain.document_loaders import UnstructuredEmailLoader loader = UnstructuredEmailLoader("fake-email.eml") loader.load() loader = UnstructuredEmailLoader("fake-email.msg") loader.load() ``` --- langchain/document_loaders/email.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/langchain/document_loaders/email.py b/langchain/document_loaders/email.py index b6a31ed7..bd307574 100644 --- a/langchain/document_loaders/email.py +++ b/langchain/document_loaders/email.py @@ -4,16 +4,32 @@ from typing import List from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader -from langchain.document_loaders.unstructured import UnstructuredFileLoader +from langchain.document_loaders.unstructured import ( + UnstructuredFileLoader, + satisfies_min_unstructured_version, +) class UnstructuredEmailLoader(UnstructuredFileLoader): """Loader that uses unstructured to load email files.""" def _get_elements(self) -> List: - from unstructured.partition.email import partition_email + from unstructured.file_utils.filetype import FileType, detect_filetype - return partition_email(filename=self.file_path) + filetype = detect_filetype(self.file_path) + + if filetype == FileType.EML: + from unstructured.partition.email import partition_email + + return partition_email(filename=self.file_path) + elif satisfies_min_unstructured_version("0.5.8") and filetype == FileType.MSG: + from unstructured.partition.msg import 
partition_msg + + return partition_msg(filename=self.file_path) + else: + raise ValueError( + f"Filetype {filetype} is not supported in UnstructuredEmailLoader." + ) class OutlookMessageLoader(BaseLoader):