feat: add Unstructured API loaders (#3906)

### Summary

Adds `UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader`, which partition documents through the Unstructured API. Defaults to the URL for the hosted Unstructured API, but can switch to a self-hosted or locally running API using the `url` kwarg. Currently, the Unstructured API is open and does not require an API key, but it will soon. A note was added about that to the Unstructured ecosystem page.

### Testing

```python
from langchain.document_loaders import UnstructuredAPIFileIOLoader

filename = "fake-email.eml"

with open(filename, "rb") as f:
    loader = UnstructuredAPIFileIOLoader(file=f, file_filename=filename)
    docs = loader.load()

docs[0]
```

```python
from langchain.document_loaders import UnstructuredAPIFileLoader

filename = "fake-email.eml"

loader = UnstructuredAPIFileLoader(file_path=filename, mode="elements")
docs = loader.load()

docs[0]
```
2023-05-01 23:37:35 -04:00 · 2023-05-01 23:37:35 -04:00 · c51dec5101
commit c51dec5101
parent 13269fb583
3 changed files with 88 additions and 0 deletions
--- a/docs/ecosystem/unstructured.md
+++ b/docs/ecosystem/unstructured.md
@ -10,6 +10,10 @@ This page is broken into two parts: installation and setup, and then references
 `unstructured` wrappers.

 ## Installation and Setup
+
+If you are using a loader that runs locally, use the following steps to get `unstructured` and
+its dependencies running locally.
+
 - Install the Python SDK with `pip install "unstructured[local-inference]"`
 - Install the following system dependencies if they are not already available on your system.
  Depending on what document types you're parsing, you may not need all of these.
@ -25,6 +29,15 @@ This page is broken into two parts: installation and setup, and then references
      using the `"fast"` strategy, which uses `pdfminer` directly and doesn't require
      `detectron2`.

+If you want to get up and running with less setup, you can
+simply run `pip install unstructured` and use `UnstructuredAPIFileLoader` or
+`UnstructuredAPIFileIOLoader`. That will process your document using the hosted Unstructured API.
+Note that currently (as of 1 May 2023) the Unstructured API is open, but it will soon require
+an API key. The [Unstructured documentation page](https://unstructured-io.github.io/) will have
+instructions on how to generate an API key once they're available. Check out the instructions
+[here](https://github.com/Unstructured-IO/unstructured-api#dizzy-instructions-for-using-the-docker-image)
+if you'd like to self-host the Unstructured API or run it locally.
+
 ## Wrappers

 ### Data Loaders
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -77,6 +77,8 @@ from langchain.document_loaders.telegram import TelegramChatLoader
 from langchain.document_loaders.text import TextLoader
 from langchain.document_loaders.twitter import TwitterTweetLoader
 from langchain.document_loaders.unstructured import (
+    UnstructuredAPIFileIOLoader,
+    UnstructuredAPIFileLoader,
    UnstructuredFileIOLoader,
    UnstructuredFileLoader,
 )
@ -164,7 +166,9 @@ __all__ = [
    "TwitterTweetLoader",
    "UnstructuredEPubLoader",
    "UnstructuredEmailLoader",
+    "UnstructuredAPIFileIOLoader",
    "UnstructuredFileIOLoader",
+    "UnstructuredAPIFileLoader",
    "UnstructuredFileLoader",
    "UnstructuredHTMLLoader",
    "UnstructuredImageLoader",
--- a/langchain/document_loaders/unstructured.py
+++ b/langchain/document_loaders/unstructured.py
@ -98,6 +98,42 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
        return {"source": self.file_path}


+class UnstructuredAPIFileLoader(UnstructuredFileLoader):
+    """Loader that uses the unstructured web API to load files.
+
+    Elements are obtained by calling ``partition_via_api`` from
+    ``unstructured.partition.api`` with the stored file path, API key and
+    API URL.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        mode: str = "single",
+        url: str = "https://api.unstructured.io/general/v0/general",
+        api_key: str = "",
+        **unstructured_kwargs: Any,
+    ):
+        """Initialize with file path.
+
+        Args:
+            file_path: Path of the file to partition; forwarded to the
+                parent loader.
+            mode: Forwarded unchanged to the parent loader.
+            url: Endpoint later passed to ``partition_via_api`` as
+                ``api_url``.
+            api_key: Key later passed to ``partition_via_api`` as
+                ``api_key``.
+            **unstructured_kwargs: Extra keyword arguments forwarded to the
+                parent loader.
+
+        Raises:
+            ValueError: If ``satisfies_min_unstructured_version("0.6.2")``
+                returns a falsy value.
+        """
+
+        # Fail early, before any state is stored, when the version check
+        # does not pass.
+        min_unstructured_version = "0.6.2"
+        if not satisfies_min_unstructured_version(min_unstructured_version):
+            raise ValueError(
+                "Partitioning via API is only supported in "
+                f"unstructured>={min_unstructured_version}."
+            )
+
+        # Stored on the instance so _get_elements can pass them to the API.
+        self.url = url
+        self.api_key = api_key
+
+        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
+
+    def _get_elements(self) -> List:
+        """Return the result of ``partition_via_api`` for ``self.file_path``."""
+        # Function-scope import; presumably keeps `unstructured` an optional
+        # dependency of this module -- TODO confirm.
+        from unstructured.partition.api import partition_via_api
+
+        # self.file_path and self.unstructured_kwargs are not assigned in this
+        # class; presumably the parent __init__ sets them -- verify.
+        return partition_via_api(
+            filename=self.file_path,
+            api_key=self.api_key,
+            api_url=self.url,
+            **self.unstructured_kwargs,
+        )
+
+
 class UnstructuredFileIOLoader(UnstructuredBaseLoader):
    """Loader that uses unstructured to load file IO objects."""

@ -113,3 +149,38 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):

    def _get_metadata(self) -> dict:
        return {}
+
+
+class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
+    """Loader that uses the unstructured web API to load file IO objects.
+
+    Elements are obtained by calling ``partition_via_api`` from
+    ``unstructured.partition.api`` with the stored file object, API key and
+    API URL.
+    """
+
+    def __init__(
+        self,
+        file: IO,
+        mode: str = "single",
+        url: str = "https://api.unstructured.io/general/v0/general",
+        api_key: str = "",
+        **unstructured_kwargs: Any,
+    ):
+        """Initialize with file object.
+
+        Args:
+            file: File object to partition; forwarded to the parent loader.
+            mode: Forwarded unchanged to the parent loader.
+            url: Endpoint later passed to ``partition_via_api`` as
+                ``api_url``.
+            api_key: Key later passed to ``partition_via_api`` as
+                ``api_key``.
+            **unstructured_kwargs: Extra keyword arguments forwarded to the
+                parent loader.
+
+        Raises:
+            ValueError: If ``satisfies_min_unstructured_version("0.6.2")``
+                returns a falsy value.
+        """
+
+        # Fail early, before any state is stored, when the version check
+        # does not pass.
+        min_unstructured_version = "0.6.2"
+        if not satisfies_min_unstructured_version(min_unstructured_version):
+            raise ValueError(
+                "Partitioning via API is only supported in "
+                f"unstructured>={min_unstructured_version}."
+            )
+
+        # Stored on the instance so _get_elements can pass them to the API.
+        self.url = url
+        self.api_key = api_key
+        super().__init__(file=file, mode=mode, **unstructured_kwargs)
+
+    def _get_elements(self) -> List:
+        """Return the result of ``partition_via_api`` for ``self.file``."""
+        # Function-scope import; presumably keeps `unstructured` an optional
+        # dependency of this module -- TODO confirm.
+        from unstructured.partition.api import partition_via_api
+
+        # self.file and self.unstructured_kwargs are not assigned in this
+        # class; presumably the parent __init__ sets them -- verify.
+        return partition_via_api(
+            file=self.file,
+            api_key=self.api_key,
+            api_url=self.url,
+            **self.unstructured_kwargs,
+        )