feat: add support for non-html in `UnstructuredURLLoader` (#2793)

### Summary

Adds support for processing non-HTML document types in the URL loader. For example, the URL loader can now process PDF or Markdown files hosted at a URL.

### Testing

```python
from langchain.document_loaders import UnstructuredURLLoader

urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"]

loader = UnstructuredURLLoader(urls=urls, strategy="fast")
docs = loader.load()
print(docs[0].page_content[:1000])
```
1 year ago · f0be3b0689
parent e081c62aac
commit f0be3b0689
1 changed file with 10 additions and 1 deletion
--- a/langchain/document_loaders/url.py
+++ b/langchain/document_loaders/url.py
@@ -47,17 +47,26 @@ class UnstructuredURLLoader(BaseLoader):

        return unstructured_version >= (0, 5, 7)

+    def __is_non_html_available(self) -> bool:
+        _unstructured_version = self.__version.split("-")[0]
+        unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])
+
+        return unstructured_version >= (0, 5, 12)
+
    def load(self) -> List[Document]:
        """Load file."""
+        from unstructured.partition.auto import partition
        from unstructured.partition.html import partition_html

        docs: List[Document] = list()
        for url in self.urls:
            try:
-                if self.__is_headers_available():
+                if self.headers and self.__is_headers_available():
                    elements = partition_html(
                        url=url, headers=self.headers, **self.unstructured_kwargs
                    )
+                elif self.__is_non_html_available():
+                    elements = partition(url=url, **self.unstructured_kwargs)
                else:
                    elements = partition_html(url=url, **self.unstructured_kwargs)
            except Exception as e: