mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
feat: adds support for MSFT Outlook files in UnstructuredEmailLoader
(#2450)
### Summary

Adds support for Microsoft Outlook emails saved in `.msg` format to `UnstructuredEmailLoader`. This works if the user has `unstructured>=0.5.8` installed.

### Testing

The following tests use the example files under `example-docs` in the Unstructured repo.

```python
from langchain.document_loaders import UnstructuredEmailLoader

loader = UnstructuredEmailLoader("fake-email.eml")
loader.load()

loader = UnstructuredEmailLoader("fake-email.msg")
loader.load()
```
This commit is contained in:
parent
007babb363
commit
1140bd79a0
@ -4,16 +4,32 @@ from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||
from langchain.document_loaders.unstructured import (
|
||||
UnstructuredFileLoader,
|
||||
satisfies_min_unstructured_version,
|
||||
)
|
||||
|
||||
|
||||
class UnstructuredEmailLoader(UnstructuredFileLoader):
    """Load email files by partitioning them with the ``unstructured`` package.

    Files detected as EML are passed to ``partition_email``. Files detected
    as MSG are passed to ``partition_msg``, but only when
    ``satisfies_min_unstructured_version("0.5.8")`` is true.
    """

    def _get_elements(self) -> List:
        """Partition ``self.file_path`` according to its detected file type.

        Returns:
            The list returned by the matching ``unstructured`` partitioner.

        Raises:
            ValueError: If the detected file type is not EML, and is not MSG
                with the minimum-version check satisfied.
        """
        # Imported lazily so the module can be imported without touching
        # ``unstructured`` until elements are actually requested.
        from unstructured.file_utils.filetype import FileType, detect_filetype

        filetype = detect_filetype(self.file_path)

        if filetype == FileType.EML:
            from unstructured.partition.email import partition_email

            return partition_email(filename=self.file_path)

        # The version check is evaluated first, so ``FileType.MSG`` is only
        # looked up once it passes.
        # NOTE(review): presumably older ``unstructured`` releases lack
        # ``FileType.MSG`` -- confirm before reordering this condition.
        if satisfies_min_unstructured_version("0.5.8") and filetype == FileType.MSG:
            from unstructured.partition.msg import partition_msg

            return partition_msg(filename=self.file_path)

        raise ValueError(
            f"Filetype {filetype} is not supported in UnstructuredEmailLoader."
        )
|
||||
|
||||
|
||||
class OutlookMessageLoader(BaseLoader):
|
||||
|
Loading…
Reference in New Issue
Block a user