mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
fix: pass unstructured kwargs down in all unstructured loaders (#2506)
### Summary

PR #1667 updated several Unstructured loaders to accept `unstructured_kwargs` in their `__init__` methods. However, that PR did not add this functionality to every Unstructured loader. This PR ensures that `unstructured_kwargs` are accepted and passed down to the underlying `unstructured` partition functions in all remaining Unstructured loaders.
This commit is contained in:
parent
c913acdb4c
commit
270384fb44
@ -21,11 +21,11 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
|
||||
if filetype == FileType.EML:
|
||||
from unstructured.partition.email import partition_email
|
||||
|
||||
return partition_email(filename=self.file_path)
|
||||
return partition_email(filename=self.file_path, **self.unstructured_kwargs)
|
||||
elif satisfies_min_unstructured_version("0.5.8") and filetype == FileType.MSG:
|
||||
from unstructured.partition.msg import partition_msg
|
||||
|
||||
return partition_msg(filename=self.file_path)
|
||||
return partition_msg(filename=self.file_path, **self.unstructured_kwargs)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Filetype {filetype} is not supported in UnstructuredEmailLoader."
|
||||
|
@ -19,4 +19,4 @@ class UnstructuredEPubLoader(UnstructuredFileLoader):
|
||||
)
|
||||
from unstructured.partition.epub import partition_epub
|
||||
|
||||
return partition_epub(filename=self.file_path)
|
||||
return partition_epub(filename=self.file_path, **self.unstructured_kwargs)
|
||||
|
@ -22,4 +22,4 @@ class UnstructuredMarkdownLoader(UnstructuredFileLoader):
|
||||
"Partitioning markdown files is only supported in unstructured>=0.4.16."
|
||||
)
|
||||
|
||||
return partition_md(filename=self.file_path)
|
||||
return partition_md(filename=self.file_path, **self.unstructured_kwargs)
|
||||
|
@ -1,6 +1,6 @@
|
||||
"""Loader that uses unstructured to load HTML files."""
|
||||
import logging
|
||||
from typing import List
|
||||
from typing import Any, List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
@ -12,7 +12,11 @@ class UnstructuredURLLoader(BaseLoader):
|
||||
"""Loader that uses unstructured to load HTML files."""
|
||||
|
||||
def __init__(
|
||||
self, urls: List[str], continue_on_failure: bool = True, headers: dict = {}
|
||||
self,
|
||||
urls: List[str],
|
||||
continue_on_failure: bool = True,
|
||||
headers: dict = {},
|
||||
**unstructured_kwargs: Any,
|
||||
):
|
||||
"""Initialize with file path."""
|
||||
try:
|
||||
@ -35,6 +39,7 @@ class UnstructuredURLLoader(BaseLoader):
|
||||
self.urls = urls
|
||||
self.continue_on_failure = continue_on_failure
|
||||
self.headers = headers
|
||||
self.unstructured_kwargs = unstructured_kwargs
|
||||
|
||||
def __is_headers_available(self) -> bool:
|
||||
_unstructured_version = self.__version.split("-")[0]
|
||||
@ -50,9 +55,11 @@ class UnstructuredURLLoader(BaseLoader):
|
||||
for url in self.urls:
|
||||
try:
|
||||
if self.__is_headers_available():
|
||||
elements = partition_html(url=url, headers=self.headers)
|
||||
elements = partition_html(
|
||||
url=url, headers=self.headers, **self.unstructured_kwargs
|
||||
)
|
||||
else:
|
||||
elements = partition_html(url=url)
|
||||
elements = partition_html(url=url, **self.unstructured_kwargs)
|
||||
except Exception as e:
|
||||
if self.continue_on_failure:
|
||||
logger.error(f"Error fetching or processing {url}, exeption: {e}")
|
||||
|
Loading…
Reference in New Issue
Block a user