feat: add support for non-html in UnstructuredURLLoader (#2793)

### Summary

Adds support for processing non-HTML document types in the URL loader.
For example, the URL loader can now process a PDF or Markdown file
hosted at a URL.

### Testing

```python
from langchain.document_loaders import UnstructuredURLLoader

urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"]

loader = UnstructuredURLLoader(urls=urls, strategy="fast")
docs = loader.load()
print(docs[0].page_content[:1000])
```
This commit is contained in:
Matt Robinson 2023-04-12 20:06:28 -04:00 committed by GitHub
parent e081c62aac
commit f0be3b0689
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -47,17 +47,26 @@ class UnstructuredURLLoader(BaseLoader):
return unstructured_version >= (0, 5, 7)
def __is_non_html_available(self) -> bool:
    """Return True when the parsed version is at least 0.5.12.

    Only the part of ``self.__version`` before the first ``-`` is
    considered; it is split on ``.`` and every component is converted to
    ``int`` before the tuple comparison. ``load`` uses the result to decide
    whether to call the generic ``partition`` on a URL.

    NOTE(review): ``self.__version`` is presumably the installed
    ``unstructured`` version string -- confirm where it is assigned.
    """
    release, _, _ = self.__version.partition("-")
    parsed = tuple(int(component) for component in release.split("."))
    return parsed >= (0, 5, 12)
def load(self) -> List[Document]:
"""Load file."""
from unstructured.partition.auto import partition
from unstructured.partition.html import partition_html
docs: List[Document] = list()
for url in self.urls:
try:
if self.__is_headers_available():
if self.headers and self.__is_headers_available():
elements = partition_html(
url=url, headers=self.headers, **self.unstructured_kwargs
)
elif self.__is_non_html_available():
elements = partition(url=url, **self.unstructured_kwargs)
else:
elements = partition_html(url=url, **self.unstructured_kwargs)
except Exception as e: