diff --git a/langchain/document_loaders/email.py b/langchain/document_loaders/email.py index bd30757493..ce0e281705 100644 --- a/langchain/document_loaders/email.py +++ b/langchain/document_loaders/email.py @@ -21,11 +21,11 @@ class UnstructuredEmailLoader(UnstructuredFileLoader): if filetype == FileType.EML: from unstructured.partition.email import partition_email - return partition_email(filename=self.file_path) + return partition_email(filename=self.file_path, **self.unstructured_kwargs) elif satisfies_min_unstructured_version("0.5.8") and filetype == FileType.MSG: from unstructured.partition.msg import partition_msg - return partition_msg(filename=self.file_path) + return partition_msg(filename=self.file_path, **self.unstructured_kwargs) else: raise ValueError( f"Filetype {filetype} is not supported in UnstructuredEmailLoader." diff --git a/langchain/document_loaders/epub.py b/langchain/document_loaders/epub.py index 23b8f54182..91ea1a14a9 100644 --- a/langchain/document_loaders/epub.py +++ b/langchain/document_loaders/epub.py @@ -19,4 +19,4 @@ class UnstructuredEPubLoader(UnstructuredFileLoader): ) from unstructured.partition.epub import partition_epub - return partition_epub(filename=self.file_path) + return partition_epub(filename=self.file_path, **self.unstructured_kwargs) diff --git a/langchain/document_loaders/markdown.py b/langchain/document_loaders/markdown.py index c049e8ff6d..db7b8094d8 100644 --- a/langchain/document_loaders/markdown.py +++ b/langchain/document_loaders/markdown.py @@ -22,4 +22,4 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader): "Partitioning markdown files is only supported in unstructured>=0.4.16." ) - return partition_md(filename=self.file_path) + return partition_md(filename=self.file_path, **self.unstructured_kwargs) diff --git a/langchain/document_loaders/url.py b/langchain/document_loaders/url.py index a84e815a5e..a94c85e564 100644 --- a/langchain/document_loaders/url.py +++ b/langchain/document_loaders/url.py @@ -1,6 +1,6 @@ """Loader that uses unstructured to load HTML files.""" import logging -from typing import List +from typing import Any, List from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader @@ -12,7 +12,11 @@ class UnstructuredURLLoader(BaseLoader): """Loader that uses unstructured to load HTML files.""" def __init__( - self, urls: List[str], continue_on_failure: bool = True, headers: dict = {} + self, + urls: List[str], + continue_on_failure: bool = True, + headers: dict = {}, + **unstructured_kwargs: Any, ): """Initialize with file path.""" try: @@ -35,6 +39,7 @@ class UnstructuredURLLoader(BaseLoader): self.urls = urls self.continue_on_failure = continue_on_failure self.headers = headers + self.unstructured_kwargs = unstructured_kwargs def __is_headers_available(self) -> bool: _unstructured_version = self.__version.split("-")[0] @@ -50,9 +55,11 @@ class UnstructuredURLLoader(BaseLoader): for url in self.urls: try: if self.__is_headers_available(): - elements = partition_html(url=url, headers=self.headers) + elements = partition_html( + url=url, headers=self.headers, **self.unstructured_kwargs + ) else: - elements = partition_html(url=url) + elements = partition_html(url=url, **self.unstructured_kwargs) except Exception as e: if self.continue_on_failure: logger.error(f"Error fetching or processing {url}, exeption: {e}")