From 3e0c44bae82e143fd4870c7489df2ee7b4a4b38d Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 19 Apr 2023 19:16:24 -0400 Subject: [PATCH] enhancement: support headers for non-html urls (#3166) ### Summary Updates the `UnstructuredURLLoader` to support passing in headers for non-HTML content types. While this update maintains backward compatibility with older versions of `unstructured`, we strongly recommend upgrading to `unstructured>=0.5.13` if you are using the `UnstructuredURLLoader`. ### Testing #### With headers ```python from langchain.document_loaders import UnstructuredURLLoader urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"] loader = UnstructuredURLLoader(urls=urls, headers={"Accept": "application/json"}, strategy="fast") docs = loader.load() print(docs[0].page_content[:1000]) ``` #### Without headers ```python from langchain.document_loaders import UnstructuredURLLoader urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"] loader = UnstructuredURLLoader(urls=urls, strategy="fast") docs = loader.load() print(docs[0].page_content[:1000]) ``` --------- Co-authored-by: Zander Chase <130414180+vowelparrot@users.noreply.github.com> --- langchain/document_loaders/url.py | 47 ++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/langchain/document_loaders/url.py b/langchain/document_loaders/url.py index d8b25c37..3f52c2b8 100644 --- a/langchain/document_loaders/url.py +++ b/langchain/document_loaders/url.py @@ -15,7 +15,6 @@ class UnstructuredURLLoader(BaseLoader): self, urls: List[str], continue_on_failure: bool = True, - headers: dict = {}, **unstructured_kwargs: Any, ): """Initialize with file path.""" @@ -30,23 +29,37 @@ class UnstructuredURLLoader(BaseLoader): "`pip install unstructured`" ) - if not self.__is_headers_available() and 
len(headers.keys()) != 0: - logger.warning( - "You are using old version of unstructured. " - "The headers parameter is ignored" - ) + headers = unstructured_kwargs.pop("headers", {}) + if len(headers.keys()) != 0: + warn_about_headers = False + if self.__is_non_html_available(): + warn_about_headers = not self.__is_headers_available_for_non_html() + else: + warn_about_headers = not self.__is_headers_available_for_html() + + if warn_about_headers: + logger.warning( + "You are using an old version of unstructured. " + "The headers parameter is ignored" + ) self.urls = urls self.continue_on_failure = continue_on_failure self.headers = headers self.unstructured_kwargs = unstructured_kwargs - def __is_headers_available(self) -> bool: + def __is_headers_available_for_html(self) -> bool: _unstructured_version = self.__version.split("-")[0] unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")]) return unstructured_version >= (0, 5, 7) + def __is_headers_available_for_non_html(self) -> bool: + _unstructured_version = self.__version.split("-")[0] + unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")]) + + return unstructured_version >= (0, 5, 13) + def __is_non_html_available(self) -> bool: _unstructured_version = self.__version.split("-")[0] unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")]) @@ -61,14 +74,20 @@ class UnstructuredURLLoader(BaseLoader): docs: List[Document] = list() for url in self.urls: try: - if self.headers and self.__is_headers_available(): - elements = partition_html( - url=url, headers=self.headers, **self.unstructured_kwargs - ) - elif self.__is_non_html_available(): - elements = partition(url=url, **self.unstructured_kwargs) + if self.__is_non_html_available(): + if self.__is_headers_available_for_non_html(): + elements = partition( + url=url, headers=self.headers, **self.unstructured_kwargs + ) + else: + elements = partition(url=url, **self.unstructured_kwargs) else: 
- elements = partition_html(url=url, **self.unstructured_kwargs) + if self.__is_headers_available_for_html(): + elements = partition_html( + url=url, headers=self.headers, **self.unstructured_kwargs + ) + else: + elements = partition_html(url=url, **self.unstructured_kwargs) except Exception as e: if self.continue_on_failure: logger.error(f"Error fetching or processing {url}, exeption: {e}")