From 6e85cbcce3ba77574a2f119346a713f431c799b0 Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Tue, 28 Mar 2023 13:27:52 -0700
Subject: [PATCH] Harrison/unstructured validation (#2111)

Co-authored-by: kravetsmic <79907559+kravetsmic@users.noreply.github.com>
---
 langchain/document_loaders/url.py | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

Review notes (everything between the three-dash line and the diff is
ignored when the patch is applied):

* Adds an optional `headers` argument to UnstructuredURLLoader. load()
  forwards it to partition_html() only when the installed unstructured
  release is at least 0.5.7; otherwise a warning is logged at construction
  time and the headers are not forwarded.
* The loader stores a copy of `headers`, so the shared `{}` default is
  never aliased by an instance.
* The version check compares only the leading numeric components of the
  version string, so a non-numeric component no longer raises ValueError
  from the constructor.
* The post-image blob id on the `index` line below is the one recorded by
  the original commit; it was not recomputed for the revised hunk.

diff --git a/langchain/document_loaders/url.py b/langchain/document_loaders/url.py
index 95db7726..a84e815a 100644
--- a/langchain/document_loaders/url.py
+++ b/langchain/document_loaders/url.py
@@ -11,17 +11,49 @@ logger = logging.getLogger(__file__)
 class UnstructuredURLLoader(BaseLoader):
     """Loader that uses unstructured to load HTML files."""
 
-    def __init__(self, urls: List[str], continue_on_failure: bool = True):
+    def __init__(
+        self, urls: List[str], continue_on_failure: bool = True, headers: dict = {}
+    ):
         """Initialize with file path."""
         try:
             import unstructured  # noqa:F401
+            from unstructured.__version__ import __version__ as __unstructured_version__
+
+            self.__version = __unstructured_version__
         except ImportError:
             raise ValueError(
                 "unstructured package not found, please install it with "
                 "`pip install unstructured`"
             )
+
+        # Warn only when headers were actually supplied but cannot be forwarded.
+        if headers and not self.__is_headers_available():
+            logger.warning(
+                "You are using old version of unstructured. "
+                "The headers parameter is ignored"
+            )
+
         self.urls = urls
         self.continue_on_failure = continue_on_failure
+        # Copy so that neither the shared `{}` default nor the caller's dict is
+        # aliased by this instance.
+        self.headers = dict(headers or {})
+
+    def __is_headers_available(self) -> bool:
+        """Return True if the installed unstructured is at least 0.5.7.
+
+        load() only passes ``headers`` to ``partition_html`` when this is True.
+        """
+        release = self.__version.split("-")[0]
+        numeric_parts: List[int] = []
+        for part in release.split("."):
+            # Stop at the first non-numeric component (e.g. "dev1", "0rc1")
+            # instead of letting int() raise ValueError.
+            if not part.isdecimal():
+                break
+            numeric_parts.append(int(part))
+
+        return tuple(numeric_parts) >= (0, 5, 7)
 
     def load(self) -> List[Document]:
         """Load file."""
@@ -30,7 +62,10 @@ class UnstructuredURLLoader(BaseLoader):
         docs: List[Document] = list()
         for url in self.urls:
             try:
-                elements = partition_html(url=url)
+                if self.__is_headers_available():
+                    elements = partition_html(url=url, headers=self.headers)
+                else:
+                    elements = partition_html(url=url)
             except Exception as e:
                 if self.continue_on_failure:
                     logger.error(f"Error fetching or processing {url}, exeption: {e}")