mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
feat: adds support for MSFT Outlook files in UnstructuredEmailLoader (#2450)
### Summary

Adds support for MSFT Outlook emails saved in `.msg` format to `UnstructuredEmailLoader`. Works if the user has `unstructured>=0.5.8` installed.

### Testing

The following tests use the example files under `example-docs` in the Unstructured repo.

```python
from langchain.document_loaders import UnstructuredEmailLoader

loader = UnstructuredEmailLoader("fake-email.eml")
loader.load()

loader = UnstructuredEmailLoader("fake-email.msg")
loader.load()
```
This commit is contained in:
parent
007babb363
commit
1140bd79a0
@ -4,16 +4,32 @@ from typing import List
|
|||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.document_loaders.base import BaseLoader
|
from langchain.document_loaders.base import BaseLoader
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import (
|
||||||
|
UnstructuredFileLoader,
|
||||||
|
satisfies_min_unstructured_version,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class UnstructuredEmailLoader(UnstructuredFileLoader):
|
class UnstructuredEmailLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load email files."""
|
"""Loader that uses unstructured to load email files."""
|
||||||
|
|
||||||
def _get_elements(self) -> List:
|
def _get_elements(self) -> List:
|
||||||
|
from unstructured.file_utils.filetype import FileType, detect_filetype
|
||||||
|
|
||||||
|
filetype = detect_filetype(self.file_path)
|
||||||
|
|
||||||
|
if filetype == FileType.EML:
|
||||||
from unstructured.partition.email import partition_email
|
from unstructured.partition.email import partition_email
|
||||||
|
|
||||||
return partition_email(filename=self.file_path)
|
return partition_email(filename=self.file_path)
|
||||||
|
elif satisfies_min_unstructured_version("0.5.8") and filetype == FileType.MSG:
|
||||||
|
from unstructured.partition.msg import partition_msg
|
||||||
|
|
||||||
|
return partition_msg(filename=self.file_path)
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
f"Filetype {filetype} is not supported in UnstructuredEmailLoader."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class OutlookMessageLoader(BaseLoader):
|
class OutlookMessageLoader(BaseLoader):
|
||||||
|
Loading…
Reference in New Issue
Block a user