feat: batch multiple files in a single Unstructured API request (#4525)

### Submit Multiple Files to the Unstructured API

Enables batching multiple files into a single Unstructured API requests.
Support for requests with multiple files was added to both
`UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader`. Note that
if you submit multiple files in "single" mode, the result will be
concatenated into a single document. We recommend using this feature in
"elements" mode.

### Testing

The following should load both documents, using two of the example docs
from the integration tests folder.

```python
    from langchain.document_loaders import UnstructuredAPIFileLoader

    file_paths = ["examples/layout-parser-paper.pdf",  "examples/whatsapp_chat.txt"]

    loader = UnstructuredAPIFileLoader(
        file_paths=file_paths,
        api_key="FAKE_API_KEY",
        strategy="fast",
        mode="elements",
    )
    docs = loader.load()
```
searx_updates
Matt Robinson 1 year ago committed by GitHub
parent 0c3de0a0b3
commit bf3f554357
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -287,10 +287,118 @@
"docs[:5]"
]
},
{
"cell_type": "markdown",
"id": "b066cb5a",
"metadata": {},
"source": [
"## Unstructured API\n",
"\n",
"If you want to get up and running with less set up, you can simply run `pip install unstructured` and use `UnstructuredAPIFileLoader` or `UnstructuredAPIFileIOLoader`. That will process your document using the hosted Unstructured API. Note that currently (as of 11 May 2023) the Unstructured API is open, but it will soon require an API. The [Unstructured documentation](https://unstructured-io.github.io/) page will have instructions on how to generate an API key once theyre available. Check out the instructions [here](https://github.com/Unstructured-IO/unstructured-api#dizzy-instructions-for-using-the-docker-image) if youd like to self-host the Unstructured API or run it locally."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b50c70bc",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import UnstructuredAPIFileLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "12b6d2cf",
"metadata": {},
"outputs": [],
"source": [
"filenames = [\"example_data/fake.docx\", \"example_data/fake-email.eml\"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "39a9894d",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredAPIFileLoader(\n",
" file_path=filenames[0],\n",
" api_key=\"FAKE_API_KEY\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "386eb63c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = loader.load()\n",
"docs[0]"
]
},
{
"cell_type": "markdown",
"id": "94158999",
"metadata": {},
"source": [
"You can also batch multiple files through the Unstructured API in a single API using `UnstructuredAPIFileLoader`."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "79a18e7e",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredAPIFileLoader(\n",
" file_path=filenames,\n",
" api_key=\"FAKE_API_KEY\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a3d7c846",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='Lorem ipsum dolor sit amet.\\n\\nThis is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', metadata={'source': ['example_data/fake.docx', 'example_data/fake-email.eml']})"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs = loader.load()\n",
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f52b04cb",
"id": "0e510495",
"metadata": {},
"outputs": [],
"source": []
@ -312,7 +420,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.8.13"
}
},
"nbformat": 4,

@ -23,7 +23,7 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader):
is_ppt = detect_filetype(self.file_path) == FileType.PPT
except ImportError:
_, extension = os.path.splitext(self.file_path)
_, extension = os.path.splitext(str(self.file_path))
is_ppt = extension == ".ppt"
if is_ppt and unstructured_version < (0, 4, 11):

@ -1,6 +1,7 @@
"""Loader that uses unstructured to load files."""
import collections
from abc import ABC, abstractmethod
from typing import IO, Any, List
from typing import IO, Any, List, Sequence, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
@ -92,7 +93,10 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
"""Loader that uses unstructured to load files."""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
self,
file_path: Union[str, List[str]],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
self.file_path = file_path
@ -107,12 +111,48 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
return {"source": self.file_path}
def get_elements_from_api(
file_path: Union[str, List[str], None] = None,
file: Union[IO, Sequence[IO], None] = None,
api_url: str = "https://api.unstructured.io/general/v0/general",
api_key: str = "",
**unstructured_kwargs: Any,
) -> List:
"""Retrieves a list of elements from the Unstructured API."""
if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list):
from unstructured.partition.api import partition_multiple_via_api
_doc_elements = partition_multiple_via_api(
filenames=file_path,
files=file,
api_key=api_key,
api_url=api_url,
**unstructured_kwargs,
)
elements = []
for _elements in _doc_elements:
elements.extend(_elements)
return elements
else:
from unstructured.partition.api import partition_via_api
return partition_via_api(
filename=file_path,
file=file,
api_key=api_key,
api_url=api_url,
**unstructured_kwargs,
)
class UnstructuredAPIFileLoader(UnstructuredFileLoader):
"""Loader that uses the unstructured web API to load files."""
def __init__(
self,
file_path: str,
file_path: Union[str, List[str]] = "",
mode: str = "single",
url: str = "https://api.unstructured.io/general/v0/general",
api_key: str = "",
@ -120,23 +160,22 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
):
"""Initialize with file path."""
min_unstructured_version = "0.6.2"
if not satisfies_min_unstructured_version(min_unstructured_version):
raise ValueError(
"Partitioning via API is only supported in "
f"unstructured>={min_unstructured_version}."
)
if isinstance(file_path, str):
validate_unstructured_version(min_unstructured_version="0.6.2")
else:
validate_unstructured_version(min_unstructured_version="0.6.3")
self.url = url
self.api_key = api_key
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
from unstructured.partition.api import partition_via_api
def _get_metadata(self) -> dict:
return {"source": self.file_path}
return partition_via_api(
filename=self.file_path,
def _get_elements(self) -> List:
return get_elements_from_api(
file_path=self.file_path,
api_key=self.api_key,
api_url=self.url,
**self.unstructured_kwargs,
@ -146,7 +185,12 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
class UnstructuredFileIOLoader(UnstructuredBaseLoader):
"""Loader that uses unstructured to load file IO objects."""
def __init__(self, file: IO, mode: str = "single", **unstructured_kwargs: Any):
def __init__(
self,
file: Union[IO, Sequence[IO]],
mode: str = "single",
**unstructured_kwargs: Any,
):
"""Initialize with file path."""
self.file = file
super().__init__(mode=mode, **unstructured_kwargs)
@ -165,7 +209,7 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
def __init__(
self,
file: IO,
file: Union[IO, Sequence[IO]],
mode: str = "single",
url: str = "https://api.unstructured.io/general/v0/general",
api_key: str = "",
@ -173,21 +217,18 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
):
"""Initialize with file path."""
min_unstructured_version = "0.6.2"
if not satisfies_min_unstructured_version(min_unstructured_version):
raise ValueError(
"Partitioning via API is only supported in "
f"unstructured>={min_unstructured_version}."
)
if isinstance(file, collections.abc.Sequence):
validate_unstructured_version(min_unstructured_version="0.6.3")
if file:
validate_unstructured_version(min_unstructured_version="0.6.2")
self.url = url
self.api_key = api_key
super().__init__(file=file, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
from unstructured.partition.api import partition_via_api
return partition_via_api(
return get_elements_from_api(
file=self.file,
api_key=self.api_key,
api_url=self.url,

@ -82,7 +82,7 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
is_doc = detect_filetype(self.file_path) == FileType.DOC
except ImportError:
_, extension = os.path.splitext(self.file_path)
_, extension = os.path.splitext(str(self.file_path))
is_doc = extension == ".doc"
if is_doc and unstructured_version < (0, 4, 11):

@ -0,0 +1,82 @@
import os
from contextlib import ExitStack
from pathlib import Path
from langchain.document_loaders import (
UnstructuredAPIFileIOLoader,
UnstructuredAPIFileLoader,
)
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/")
def test_unstructured_api_file_loader() -> None:
"""Test unstructured loader."""
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
loader = UnstructuredAPIFileLoader(
file_path=file_path,
api_key="FAKE_API_KEY",
strategy="fast",
mode="elements",
)
docs = loader.load()
assert len(docs) > 1
def test_unstructured_api_file_loader_multiple_files() -> None:
"""Test unstructured loader."""
file_paths = [
os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf"),
os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt"),
]
loader = UnstructuredAPIFileLoader(
file_path=file_paths,
api_key="FAKE_API_KEY",
strategy="fast",
mode="elements",
)
docs = loader.load()
assert len(docs) > 1
def test_unstructured_api_file_io_loader() -> None:
"""Test unstructured loader."""
file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")
with open(file_path, "rb") as f:
loader = UnstructuredAPIFileIOLoader(
file=f,
api_key="FAKE_API_KEY",
strategy="fast",
mode="elements",
file_filename=file_path,
)
docs = loader.load()
assert len(docs) > 1
def test_unstructured_api_file_loader_io_multiple_files() -> None:
"""Test unstructured loader."""
file_paths = [
os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf"),
os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt"),
]
with ExitStack() as stack:
files = [stack.enter_context(open(file_path, "rb")) for file_path in file_paths]
loader = UnstructuredAPIFileIOLoader(
file=files, # type: ignore
api_key="FAKE_API_KEY",
strategy="fast",
mode="elements",
file_filenames=file_paths,
)
docs = loader.load()
assert len(docs) > 1
Loading…
Cancel
Save