From f0be3b0689a6317a66f057aae3d7e3491100f4b7 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 12 Apr 2023 20:06:28 -0400 Subject: [PATCH] feat: add support for non-html in `UnstructuredURLLoader` (#2793) ### Summary Adds support for processing non-HTML document types in the URL loader. For example, the URL loader can now process a PDF or Markdown file hosted at a URL. ### Testing ```python from langchain.document_loaders import UnstructuredURLLoader urls = ["https://www.understandingwar.org/sites/default/files/Russian%20Offensive%20Campaign%20Assessment%2C%20April%2011%2C%202023.pdf"] loader = UnstructuredURLLoader(urls=urls, strategy="fast") docs = loader.load() print(docs[0].page_content[:1000]) ``` --- langchain/document_loaders/url.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/langchain/document_loaders/url.py b/langchain/document_loaders/url.py index c0dca346..9ab01e8f 100644 --- a/langchain/document_loaders/url.py +++ b/langchain/document_loaders/url.py @@ -47,17 +47,26 @@ class UnstructuredURLLoader(BaseLoader): return unstructured_version >= (0, 5, 7) + def __is_non_html_available(self) -> bool: + _unstructured_version = self.__version.split("-")[0] + unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")]) + + return unstructured_version >= (0, 5, 12) + def load(self) -> List[Document]: """Load file.""" + from unstructured.partition.auto import partition from unstructured.partition.html import partition_html docs: List[Document] = list() for url in self.urls: try: - if self.__is_headers_available(): + if self.headers and self.__is_headers_available(): elements = partition_html( url=url, headers=self.headers, **self.unstructured_kwargs ) + elif self.__is_non_html_available(): + elements = partition(url=url, **self.unstructured_kwargs) else: elements = partition_html(url=url, **self.unstructured_kwargs) except Exception as e: