feat: adds support for MSFT Outlook files in `UnstructuredEmailLoader` (#2450)

### Summary

Adds support for Microsoft Outlook emails saved in `.msg` format to
`UnstructuredEmailLoader`. Loading `.msg` files requires
`unstructured>=0.5.8`; `.eml` files are handled without that version check.

### Testing

The following tests use the example files under `example-docs` in the
Unstructured repo.

```python
from langchain.document_loaders import UnstructuredEmailLoader

loader = UnstructuredEmailLoader("fake-email.eml")
loader.load()

loader = UnstructuredEmailLoader("fake-email.msg")
loader.load()
```
doc
Matt Robinson 1 year ago committed by GitHub
parent 007babb363
commit 1140bd79a0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -4,16 +4,32 @@ from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.unstructured import (
UnstructuredFileLoader,
satisfies_min_unstructured_version,
)
class UnstructuredEmailLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load email files.

    Dispatches on the detected file type: EML files go to
    ``partition_email`` and Outlook MSG files go to ``partition_msg``.
    The MSG branch is only taken when
    ``satisfies_min_unstructured_version("0.5.8")`` is true.
    """

    def _get_elements(self) -> List:
        """Partition the email at ``self.file_path`` into elements.

        Returns:
            The result of ``partition_email`` for EML files, or of
            ``partition_msg`` for MSG files.

        Raises:
            ValueError: If the detected file type is neither EML nor MSG,
                or if it is MSG but the minimum-version check for
                ``unstructured`` 0.5.8 fails.
        """
        from unstructured.file_utils.filetype import FileType, detect_filetype

        filetype = detect_filetype(self.file_path)

        if filetype == FileType.EML:
            from unstructured.partition.email import partition_email

            return partition_email(filename=self.file_path)

        # The version check comes first on purpose: ``and`` short-circuits, so
        # ``FileType.MSG`` and ``unstructured.partition.msg`` are only touched
        # when the version requirement is met.
        # NOTE(review): presumably both are absent from older releases - confirm.
        if satisfies_min_unstructured_version("0.5.8") and filetype == FileType.MSG:
            from unstructured.partition.msg import partition_msg

            return partition_msg(filename=self.file_path)

        raise ValueError(
            f"Filetype {filetype} is not supported in UnstructuredEmailLoader."
        )
class OutlookMessageLoader(BaseLoader):

Loading…
Cancel
Save