feat: adds support for MSFT Outlook files in UnstructuredEmailLoader (#2450)

### Summary

Adds support for Microsoft Outlook emails saved in `.msg` format to
`UnstructuredEmailLoader`. This requires `unstructured>=0.5.8` to be
installed.

### Testing

The following tests use the example files under `example-docs` in the
Unstructured repo.

```python
from langchain.document_loaders import UnstructuredEmailLoader

loader = UnstructuredEmailLoader("fake-email.eml")
loader.load()

loader = UnstructuredEmailLoader("fake-email.msg")
loader.load()
```
This commit is contained in:
Matt Robinson 2023-04-05 18:28:14 -04:00 committed by GitHub
parent 007babb363
commit 1140bd79a0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -4,16 +4,32 @@ from typing import List
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader from langchain.document_loaders.unstructured import (
UnstructuredFileLoader,
satisfies_min_unstructured_version,
)
class UnstructuredEmailLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load email files.

    The file type is detected with unstructured's ``detect_filetype``.
    EML files are partitioned with ``partition_email``; MSG files are
    partitioned with ``partition_msg`` when the ``"0.5.8"`` minimum-version
    check for unstructured passes.
    """

    def _get_elements(self) -> List:
        """Partition the file at ``self.file_path`` into unstructured elements.

        Returns:
            Whatever ``partition_email`` (EML) or ``partition_msg`` (MSG)
            returns for the file.

        Raises:
            ValueError: If the detected file type is not EML, and is not MSG
                with the minimum-version check passing.
        """
        # unstructured is imported inside the method, not at module level.
        from unstructured.file_utils.filetype import FileType, detect_filetype

        filetype = detect_filetype(self.file_path)

        if filetype == FileType.EML:
            from unstructured.partition.email import partition_email

            return partition_email(filename=self.file_path)

        # The version check is evaluated first, so `FileType.MSG` is only
        # referenced when it passes.
        # NOTE(review): presumably older unstructured releases lack MSG
        # support -- keep this ordering unless that is confirmed otherwise.
        msg_supported = (
            satisfies_min_unstructured_version("0.5.8")
            and filetype == FileType.MSG
        )
        if msg_supported:
            from unstructured.partition.msg import partition_msg

            return partition_msg(filename=self.file_path)

        raise ValueError(
            f"Filetype {filetype} is not supported in UnstructuredEmailLoader."
        )
class OutlookMessageLoader(BaseLoader): class OutlookMessageLoader(BaseLoader):