fix: pass unstructured kwargs down in all unstructured loaders (#2506)

### Summary

#1667 updated several Unstructured loaders to accept
`unstructured_kwargs` in their `__init__` methods. However, that PR did
not add this functionality to every Unstructured loader. This PR
ensures `unstructured_kwargs` are passed down to the underlying
`partition_*` calls in all remaining Unstructured loaders.
This commit is contained in:
Matt Robinson 2023-04-06 15:29:52 -04:00 committed by GitHub
parent c913acdb4c
commit 270384fb44
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 15 additions and 8 deletions

View File

@ -21,11 +21,11 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
if filetype == FileType.EML:
from unstructured.partition.email import partition_email
return partition_email(filename=self.file_path)
return partition_email(filename=self.file_path, **self.unstructured_kwargs)
elif satisfies_min_unstructured_version("0.5.8") and filetype == FileType.MSG:
from unstructured.partition.msg import partition_msg
return partition_msg(filename=self.file_path)
return partition_msg(filename=self.file_path, **self.unstructured_kwargs)
else:
raise ValueError(
f"Filetype {filetype} is not supported in UnstructuredEmailLoader."

View File

@ -19,4 +19,4 @@ class UnstructuredEPubLoader(UnstructuredFileLoader):
)
from unstructured.partition.epub import partition_epub
return partition_epub(filename=self.file_path)
return partition_epub(filename=self.file_path, **self.unstructured_kwargs)

View File

@ -22,4 +22,4 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader):
"Partitioning markdown files is only supported in unstructured>=0.4.16."
)
return partition_md(filename=self.file_path)
return partition_md(filename=self.file_path, **self.unstructured_kwargs)

View File

@ -1,6 +1,6 @@
"""Loader that uses unstructured to load HTML files."""
import logging
from typing import List
from typing import Any, List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
@ -12,7 +12,11 @@ class UnstructuredURLLoader(BaseLoader):
"""Loader that uses unstructured to load HTML files."""
def __init__(
self, urls: List[str], continue_on_failure: bool = True, headers: dict = {}
self,
urls: List[str],
continue_on_failure: bool = True,
headers: dict = {},
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
try:
@ -35,6 +39,7 @@ class UnstructuredURLLoader(BaseLoader):
self.urls = urls
self.continue_on_failure = continue_on_failure
self.headers = headers
self.unstructured_kwargs = unstructured_kwargs
def __is_headers_available(self) -> bool:
_unstructured_version = self.__version.split("-")[0]
@ -50,9 +55,11 @@ class UnstructuredURLLoader(BaseLoader):
for url in self.urls:
try:
if self.__is_headers_available():
elements = partition_html(url=url, headers=self.headers)
elements = partition_html(
url=url, headers=self.headers, **self.unstructured_kwargs
)
else:
elements = partition_html(url=url)
elements = partition_html(url=url, **self.unstructured_kwargs)
except Exception as e:
if self.continue_on_failure:
logger.error(f"Error fetching or processing {url}, exeption: {e}")