feat: batch multiple files in a single Unstructured API request (#4525)

### Submit Multiple Files to the Unstructured API

Enables batching multiple files into a single Unstructured API request. Support for requests with multiple files was added to both `UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader`. Note that if you submit multiple files in "single" mode, the results will be concatenated into a single document. We recommend using this feature in "elements" mode.

### Testing

The following should load both documents, using two of the example docs from the integration tests folder.

```python
from langchain.document_loaders import UnstructuredAPIFileLoader

file_paths = ["examples/layout-parser-paper.pdf", "examples/whatsapp_chat.txt"]

loader = UnstructuredAPIFileLoader(
    file_path=file_paths,
    api_key="FAKE_API_KEY",
    strategy="fast",
    mode="elements",
)
docs = loader.load()
```
1 year ago · bf3f554357
parent 0c3de0a0b3
commit bf3f554357
5 changed files with 259 additions and 28 deletions
--- a/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb
@ -287,10 +287,118 @@
    "docs[:5]"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "b066cb5a",
+   "metadata": {},
+   "source": [
+    "## Unstructured API\n",
+    "\n",
+    "If you want to get up and running with less setup, you can simply run `pip install unstructured` and use `UnstructuredAPIFileLoader` or `UnstructuredAPIFileIOLoader`. That will process your document using the hosted Unstructured API. Note that currently (as of 11 May 2023) the Unstructured API is open, but it will soon require an API key. The [Unstructured documentation](https://unstructured-io.github.io/) page will have instructions on how to generate an API key once they’re available. Check out the instructions [here](https://github.com/Unstructured-IO/unstructured-api#dizzy-instructions-for-using-the-docker-image) if you’d like to self-host the Unstructured API or run it locally."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b50c70bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import UnstructuredAPIFileLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "12b6d2cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filenames = [\"example_data/fake.docx\", \"example_data/fake-email.eml\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "39a9894d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredAPIFileLoader(\n",
+    "    file_path=filenames[0],\n",
+    "    api_key=\"FAKE_API_KEY\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "386eb63c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs = loader.load()\n",
+    "docs[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "94158999",
+   "metadata": {},
+   "source": [
+    "You can also batch multiple files through the Unstructured API in a single API call using `UnstructuredAPIFileLoader`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "79a18e7e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = UnstructuredAPIFileLoader(\n",
+    "    file_path=filenames,\n",
+    "    api_key=\"FAKE_API_KEY\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a3d7c846",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='Lorem ipsum dolor sit amet.\\n\\nThis is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', metadata={'source': ['example_data/fake.docx', 'example_data/fake-email.eml']})"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs = loader.load()\n",
+    "docs[0]"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "f52b04cb",
+   "id": "0e510495",
   "metadata": {},
   "outputs": [],
   "source": []
@ -312,7 +420,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.8.13"
  }
 },
 "nbformat": 4,
--- a/langchain/document_loaders/powerpoint.py
+++ b/langchain/document_loaders/powerpoint.py
@ -23,7 +23,7 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader):

            is_ppt = detect_filetype(self.file_path) == FileType.PPT
        except ImportError:
-            _, extension = os.path.splitext(self.file_path)
+            _, extension = os.path.splitext(str(self.file_path))
            is_ppt = extension == ".ppt"

        if is_ppt and unstructured_version < (0, 4, 11):
--- a/langchain/document_loaders/unstructured.py
+++ b/langchain/document_loaders/unstructured.py
@ -1,6 +1,7 @@
 """Loader that uses unstructured to load files."""
+import collections
 from abc import ABC, abstractmethod
-from typing import IO, Any, List
+from typing import IO, Any, List, Sequence, Union

 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@ -92,7 +93,10 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
    """Loader that uses unstructured to load files."""

    def __init__(
-        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+        self,
+        file_path: Union[str, List[str]],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
    ):
        """Initialize with file path."""
        self.file_path = file_path
@ -107,12 +111,48 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
        return {"source": self.file_path}


+def get_elements_from_api(
+    file_path: Union[str, List[str], None] = None,
+    file: Union[IO, Sequence[IO], None] = None,
+    api_url: str = "https://api.unstructured.io/general/v0/general",
+    api_key: str = "",
+    **unstructured_kwargs: Any,
+) -> List:
+    """Retrieves a list of elements from the Unstructured API."""
+    if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list):
+        from unstructured.partition.api import partition_multiple_via_api
+
+        _doc_elements = partition_multiple_via_api(
+            filenames=file_path,
+            files=file,
+            api_key=api_key,
+            api_url=api_url,
+            **unstructured_kwargs,
+        )
+
+        elements = []
+        for _elements in _doc_elements:
+            elements.extend(_elements)
+
+        return elements
+    else:
+        from unstructured.partition.api import partition_via_api
+
+        return partition_via_api(
+            filename=file_path,
+            file=file,
+            api_key=api_key,
+            api_url=api_url,
+            **unstructured_kwargs,
+        )
+
+
 class UnstructuredAPIFileLoader(UnstructuredFileLoader):
    """Loader that uses the unstructured web API to load files."""

    def __init__(
        self,
-        file_path: str,
+        file_path: Union[str, List[str]] = "",
        mode: str = "single",
        url: str = "https://api.unstructured.io/general/v0/general",
        api_key: str = "",
@ -120,23 +160,22 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
    ):
        """Initialize with file path."""

-        min_unstructured_version = "0.6.2"
-        if not satisfies_min_unstructured_version(min_unstructured_version):
-            raise ValueError(
-                "Partitioning via API is only supported in "
-                f"unstructured>={min_unstructured_version}."
-            )
+        if isinstance(file_path, str):
+            validate_unstructured_version(min_unstructured_version="0.6.2")
+        else:
+            validate_unstructured_version(min_unstructured_version="0.6.3")

        self.url = url
        self.api_key = api_key

        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

-    def _get_elements(self) -> List:
-        from unstructured.partition.api import partition_via_api
+    def _get_metadata(self) -> dict:
+        return {"source": self.file_path}

-        return partition_via_api(
-            filename=self.file_path,
+    def _get_elements(self) -> List:
+        return get_elements_from_api(
+            file_path=self.file_path,
            api_key=self.api_key,
            api_url=self.url,
            **self.unstructured_kwargs,
@ -146,7 +185,12 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
 class UnstructuredFileIOLoader(UnstructuredBaseLoader):
    """Loader that uses unstructured to load file IO objects."""

-    def __init__(self, file: IO, mode: str = "single", **unstructured_kwargs: Any):
+    def __init__(
+        self,
+        file: Union[IO, Sequence[IO]],
+        mode: str = "single",
+        **unstructured_kwargs: Any,
+    ):
        """Initialize with file path."""
        self.file = file
        super().__init__(mode=mode, **unstructured_kwargs)
@ -165,7 +209,7 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):

    def __init__(
        self,
-        file: IO,
+        file: Union[IO, Sequence[IO]],
        mode: str = "single",
        url: str = "https://api.unstructured.io/general/v0/general",
        api_key: str = "",
@ -173,21 +217,18 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
    ):
        """Initialize with file path."""

-        min_unstructured_version = "0.6.2"
-        if not satisfies_min_unstructured_version(min_unstructured_version):
-            raise ValueError(
-                "Partitioning via API is only supported in "
-                f"unstructured>={min_unstructured_version}."
-            )
+        if isinstance(file, collections.abc.Sequence):
+            validate_unstructured_version(min_unstructured_version="0.6.3")
+        if file:
+            validate_unstructured_version(min_unstructured_version="0.6.2")

        self.url = url
        self.api_key = api_key
+
        super().__init__(file=file, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
-        from unstructured.partition.api import partition_via_api
-
-        return partition_via_api(
+        return get_elements_from_api(
            file=self.file,
            api_key=self.api_key,
            api_url=self.url,
--- a/langchain/document_loaders/word_document.py
+++ b/langchain/document_loaders/word_document.py
@ -82,7 +82,7 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader):

            is_doc = detect_filetype(self.file_path) == FileType.DOC
        except ImportError:
-            _, extension = os.path.splitext(self.file_path)
+            _, extension = os.path.splitext(str(self.file_path))
            is_doc = extension == ".doc"

        if is_doc and unstructured_version < (0, 4, 11):
--- a/tests/integration_tests/document_loaders/test_unstructured.py
+++ b/tests/integration_tests/document_loaders/test_unstructured.py
@ -0,0 +1,82 @@
+import os
+from contextlib import ExitStack
+from pathlib import Path
+
+from langchain.document_loaders import (
+    UnstructuredAPIFileIOLoader,
+    UnstructuredAPIFileLoader,
+)
+
+EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")
+
+
+def test_unstructured_api_file_loader() -> None:
+    """Test unstructured loader."""
+    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
+    loader = UnstructuredAPIFileLoader(
+        file_path=file_path,
+        api_key="FAKE_API_KEY",
+        strategy="fast",
+        mode="elements",
+    )
+    docs = loader.load()
+
+    assert len(docs) > 1
+
+
+def test_unstructured_api_file_loader_multiple_files() -> None:
+    """Test unstructured loader."""
+    file_paths = [
+        os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf"),
+        os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt"),
+    ]
+
+    loader = UnstructuredAPIFileLoader(
+        file_path=file_paths,
+        api_key="FAKE_API_KEY",
+        strategy="fast",
+        mode="elements",
+    )
+    docs = loader.load()
+
+    assert len(docs) > 1
+
+
+def test_unstructured_api_file_io_loader() -> None:
+    """Test unstructured loader."""
+    file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
+
+    with open(file_path, "rb") as f:
+        loader = UnstructuredAPIFileIOLoader(
+            file=f,
+            api_key="FAKE_API_KEY",
+            strategy="fast",
+            mode="elements",
+            file_filename=file_path,
+        )
+        docs = loader.load()
+
+    assert len(docs) > 1
+
+
+def test_unstructured_api_file_loader_io_multiple_files() -> None:
+    """Test unstructured loader."""
+    file_paths = [
+        os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf"),
+        os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt"),
+    ]
+
+    with ExitStack() as stack:
+        files = [stack.enter_context(open(file_path, "rb")) for file_path in file_paths]
+
+        loader = UnstructuredAPIFileIOLoader(
+            file=files,  # type: ignore
+            api_key="FAKE_API_KEY",
+            strategy="fast",
+            mode="elements",
+            file_filenames=file_paths,
+        )
+
+        docs = loader.load()
+
+    assert len(docs) > 1