From 3e0c44bae82e143fd4870c7489df2ee7b4a4b38d Mon Sep 17 00:00:00 2001
From: Matt Robinson <mthw.wm.robinson@gmail.com>
Date: Wed, 19 Apr 2023 19:16:24 -0400
Subject: [PATCH] enhancement: support headers for non-html urls (#3166)

### Summary

Updates the `UnstructuredURLLoader` to support passing in headers for
non-HTML content types. While this update maintains backward
compatibility with older versions of `unstructured`, we strongly
recommend upgrading to `unstructured>=0.5.13` if you are using the
`UnstructuredURLLoader`.

### Testing

#### With headers

```python
from langchain.document_loaders import UnstructuredURLLoader

urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"]

loader = UnstructuredURLLoader(urls=urls, headers={"Accept": "application/json"}, strategy="fast")
docs = loader.load()
print(docs[0].page_content[:1000])
```

#### Without headers

```python
from langchain.document_loaders import UnstructuredURLLoader

urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"]

loader = UnstructuredURLLoader(urls=urls, strategy="fast")
docs = loader.load()
print(docs[0].page_content[:1000])
```

---------

Co-authored-by: Zander Chase <130414180+vowelparrot@users.noreply.github.com>
---
 langchain/document_loaders/url.py | 47 ++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/langchain/document_loaders/url.py b/langchain/document_loaders/url.py
index d8b25c37..3f52c2b8 100644
--- a/langchain/document_loaders/url.py
+++ b/langchain/document_loaders/url.py
@@ -15,7 +15,6 @@ class UnstructuredURLLoader(BaseLoader):
         self,
         urls: List[str],
         continue_on_failure: bool = True,
-        headers: dict = {},
         **unstructured_kwargs: Any,
     ):
         """Initialize with file path."""
@@ -30,23 +29,37 @@ class UnstructuredURLLoader(BaseLoader):
                 "`pip install unstructured`"
             )
 
-        if not self.__is_headers_available() and len(headers.keys()) != 0:
-            logger.warning(
-                "You are using old version of unstructured. "
-                "The headers parameter is ignored"
-            )
+        headers = unstructured_kwargs.pop("headers", {})
+        if len(headers.keys()) != 0:
+            warn_about_headers = False
+            if self.__is_non_html_available():
+                warn_about_headers = not self.__is_headers_available_for_non_html()
+            else:
+                warn_about_headers = not self.__is_headers_available_for_html()
+
+            if warn_about_headers:
+                logger.warning(
+                    "You are using an old version of unstructured. "
+                    "The headers parameter is ignored"
+                )
 
         self.urls = urls
         self.continue_on_failure = continue_on_failure
         self.headers = headers
         self.unstructured_kwargs = unstructured_kwargs
 
-    def __is_headers_available(self) -> bool:
+    def __is_headers_available_for_html(self) -> bool:
         _unstructured_version = self.__version.split("-")[0]
         unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
 
         return unstructured_version >= (0, 5, 7)
 
+    def __is_headers_available_for_non_html(self) -> bool:
+        _unstructured_version = self.__version.split("-")[0]
+        unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
+
+        return unstructured_version >= (0, 5, 13)
+
     def __is_non_html_available(self) -> bool:
         _unstructured_version = self.__version.split("-")[0]
         unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
@@ -61,14 +74,20 @@ class UnstructuredURLLoader(BaseLoader):
         docs: List[Document] = list()
         for url in self.urls:
             try:
-                if self.headers and self.__is_headers_available():
-                    elements = partition_html(
-                        url=url, headers=self.headers, **self.unstructured_kwargs
-                    )
-                elif self.__is_non_html_available():
-                    elements = partition(url=url, **self.unstructured_kwargs)
+                if self.__is_non_html_available():
+                    if self.__is_headers_available_for_non_html():
+                        elements = partition(
+                            url=url, headers=self.headers, **self.unstructured_kwargs
+                        )
+                    else:
+                        elements = partition(url=url, **self.unstructured_kwargs)
                 else:
-                    elements = partition_html(url=url, **self.unstructured_kwargs)
+                    if self.__is_headers_available_for_html():
+                        elements = partition_html(
+                            url=url, headers=self.headers, **self.unstructured_kwargs
+                        )
+                    else:
+                        elements = partition_html(url=url, **self.unstructured_kwargs)
             except Exception as e:
                 if self.continue_on_failure:
                     logger.error(f"Error fetching or processing {url}, exeption: {e}")