enhancement: support headers for non-html urls (#3166)

### Summary

Updates the `UnstructuredURLLoader` to support passing in headers for
non HTML content types. While this update maintains backward
compatibility with older versions of `unstructured`, we strongly
recommended upgrading to `unstructured>=0.5.13` if you are using the
`UnstructuredURLLoader`.

### Testing

#### With headers

```python
from langchain.document_loaders import UnstructuredURLLoader

urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"]

loader = UnstructuredURLLoader(urls=urls, headers={"Accept": "application/json"}, strategy="fast")
docs = loader.load()
print(docs[0].page_content[:1000])
```

#### Without headers

```python
from langchain.document_loaders import UnstructuredURLLoader

urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"]

loader = UnstructuredURLLoader(urls=urls, strategy="fast")
docs = loader.load()
print(docs[0].page_content[:1000])
```

---------

Co-authored-by: Zander Chase <130414180+vowelparrot@users.noreply.github.com>
fix_agent_callbacks
Matt Robinson 1 year ago committed by GitHub
parent 7b1f0656b8
commit 3e0c44bae8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -15,7 +15,6 @@ class UnstructuredURLLoader(BaseLoader):
self,
urls: List[str],
continue_on_failure: bool = True,
headers: dict = {},
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
@ -30,23 +29,37 @@ class UnstructuredURLLoader(BaseLoader):
"`pip install unstructured`"
)
if not self.__is_headers_available() and len(headers.keys()) != 0:
logger.warning(
"You are using old version of unstructured. "
"The headers parameter is ignored"
)
headers = unstructured_kwargs.pop("headers", {})
if len(headers.keys()) != 0:
warn_about_headers = False
if self.__is_non_html_available():
warn_about_headers = not self.__is_headers_available_for_non_html()
else:
warn_about_headers = not self.__is_headers_available_for_html()
if warn_about_headers:
logger.warning(
"You are using an old version of unstructured. "
"The headers parameter is ignored"
)
self.urls = urls
self.continue_on_failure = continue_on_failure
self.headers = headers
self.unstructured_kwargs = unstructured_kwargs
def __is_headers_available(self) -> bool:
def __is_headers_available_for_html(self) -> bool:
_unstructured_version = self.__version.split("-")[0]
unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
return unstructured_version >= (0, 5, 7)
def __is_headers_available_for_non_html(self) -> bool:
_unstructured_version = self.__version.split("-")[0]
unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
return unstructured_version >= (0, 5, 13)
def __is_non_html_available(self) -> bool:
_unstructured_version = self.__version.split("-")[0]
unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
@ -61,14 +74,20 @@ class UnstructuredURLLoader(BaseLoader):
docs: List[Document] = list()
for url in self.urls:
try:
if self.headers and self.__is_headers_available():
elements = partition_html(
url=url, headers=self.headers, **self.unstructured_kwargs
)
elif self.__is_non_html_available():
elements = partition(url=url, **self.unstructured_kwargs)
if self.__is_non_html_available():
if self.__is_headers_available_for_non_html():
elements = partition(
url=url, headers=self.headers, **self.unstructured_kwargs
)
else:
elements = partition(url=url, **self.unstructured_kwargs)
else:
elements = partition_html(url=url, **self.unstructured_kwargs)
if self.__is_headers_available_for_html():
elements = partition_html(
url=url, headers=self.headers, **self.unstructured_kwargs
)
else:
elements = partition_html(url=url, **self.unstructured_kwargs)
except Exception as e:
if self.continue_on_failure:
logger.error(f"Error fetching or processing {url}, exeption: {e}")

Loading…
Cancel
Save