From bf3f554357e2be0674b6fdf726c31c8ccdcf8224 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Sun, 21 May 2023 23:48:20 -0400 Subject: [PATCH] feat: batch multiple files in a single Unstructured API request (#4525) ### Submit Multiple Files to the Unstructured API Enables batching multiple files into a single Unstructured API request. Support for requests with multiple files was added to both `UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader`. Note that if you submit multiple files in "single" mode, the result will be concatenated into a single document. We recommend using this feature in "elements" mode. ### Testing The following should load both documents, using two of the example docs from the integration tests folder. ```python from langchain.document_loaders import UnstructuredAPIFileLoader file_paths = ["examples/layout-parser-paper.pdf", "examples/whatsapp_chat.txt"] loader = UnstructuredAPIFileLoader( file_path=file_paths, api_key="FAKE_API_KEY", strategy="fast", mode="elements", ) docs = loader.load() ``` --- .../examples/unstructured_file.ipynb | 112 +++++++++++++++++- langchain/document_loaders/powerpoint.py | 2 +- langchain/document_loaders/unstructured.py | 89 ++++++++++---- langchain/document_loaders/word_document.py | 2 +- .../document_loaders/test_unstructured.py | 82 +++++++++++++ 5 files changed, 259 insertions(+), 28 deletions(-) create mode 100644 tests/integration_tests/document_loaders/test_unstructured.py diff --git a/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb b/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb index c79868ec..e391f1ac 100644 --- a/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb +++ b/docs/modules/indexes/document_loaders/examples/unstructured_file.ipynb @@ -287,10 +287,118 @@ "docs[:5]" ] }, + { + "cell_type": "markdown", + "id": "b066cb5a", + "metadata": {}, + "source": [ + "## Unstructured API\n", + "\n", + "If you want to get up and 
running with less setup, you can simply run `pip install unstructured` and use `UnstructuredAPIFileLoader` or `UnstructuredAPIFileIOLoader`. That will process your document using the hosted Unstructured API. Note that currently (as of 11 May 2023) the Unstructured API is open, but it will soon require an API key. The [Unstructured documentation](https://unstructured-io.github.io/) page will have instructions on how to generate an API key once they’re available. Check out the instructions [here](https://github.com/Unstructured-IO/unstructured-api#dizzy-instructions-for-using-the-docker-image) if you’d like to self-host the Unstructured API or run it locally." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b50c70bc", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredAPIFileLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "12b6d2cf", + "metadata": {}, + "outputs": [], + "source": [ + "filenames = [\"example_data/fake.docx\", \"example_data/fake-email.eml\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "39a9894d", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredAPIFileLoader(\n", + " file_path=filenames[0],\n", + " api_key=\"FAKE_API_KEY\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "386eb63c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs = loader.load()\n", + "docs[0]" + ] + }, + { + "cell_type": "markdown", + "id": "94158999", + "metadata": {}, + "source": [ + "You can also batch multiple files through the Unstructured API in a single API call using `UnstructuredAPIFileLoader`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "79a18e7e", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredAPIFileLoader(\n", + " file_path=filenames,\n", + " api_key=\"FAKE_API_KEY\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a3d7c846", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='Lorem ipsum dolor sit amet.\\n\\nThis is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', metadata={'source': ['example_data/fake.docx', 'example_data/fake-email.eml']})" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs = loader.load()\n", + "docs[0]" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "f52b04cb", + "id": "0e510495", "metadata": {}, "outputs": [], "source": [] @@ -312,7 +420,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/langchain/document_loaders/powerpoint.py b/langchain/document_loaders/powerpoint.py index 9c49be2a..be6e67ab 100644 --- a/langchain/document_loaders/powerpoint.py +++ b/langchain/document_loaders/powerpoint.py @@ -23,7 +23,7 @@ class UnstructuredPowerPointLoader(UnstructuredFileLoader): is_ppt = detect_filetype(self.file_path) == FileType.PPT except ImportError: - _, extension = os.path.splitext(self.file_path) + _, extension = os.path.splitext(str(self.file_path)) is_ppt = extension == ".ppt" if is_ppt and unstructured_version < (0, 4, 11): diff --git a/langchain/document_loaders/unstructured.py b/langchain/document_loaders/unstructured.py index 276c8551..3e7e599d 100644 --- a/langchain/document_loaders/unstructured.py +++ b/langchain/document_loaders/unstructured.py @@ -1,6 +1,7 @@ """Loader that uses unstructured to load files.""" +import collections from abc import ABC, 
abstractmethod -from typing import IO, Any, List +from typing import IO, Any, List, Sequence, Union from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader @@ -92,7 +93,10 @@ class UnstructuredFileLoader(UnstructuredBaseLoader): """Loader that uses unstructured to load files.""" def __init__( - self, file_path: str, mode: str = "single", **unstructured_kwargs: Any + self, + file_path: Union[str, List[str]], + mode: str = "single", + **unstructured_kwargs: Any, ): """Initialize with file path.""" self.file_path = file_path @@ -107,12 +111,48 @@ class UnstructuredFileLoader(UnstructuredBaseLoader): return {"source": self.file_path} +def get_elements_from_api( + file_path: Union[str, List[str], None] = None, + file: Union[IO, Sequence[IO], None] = None, + api_url: str = "https://api.unstructured.io/general/v0/general", + api_key: str = "", + **unstructured_kwargs: Any, +) -> List: + """Retrieves a list of elements from the Unstructured API.""" + if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list): + from unstructured.partition.api import partition_multiple_via_api + + _doc_elements = partition_multiple_via_api( + filenames=file_path, + files=file, + api_key=api_key, + api_url=api_url, + **unstructured_kwargs, + ) + + elements = [] + for _elements in _doc_elements: + elements.extend(_elements) + + return elements + else: + from unstructured.partition.api import partition_via_api + + return partition_via_api( + filename=file_path, + file=file, + api_key=api_key, + api_url=api_url, + **unstructured_kwargs, + ) + + class UnstructuredAPIFileLoader(UnstructuredFileLoader): """Loader that uses the unstructured web API to load files.""" def __init__( self, - file_path: str, + file_path: Union[str, List[str]] = "", mode: str = "single", url: str = "https://api.unstructured.io/general/v0/general", api_key: str = "", @@ -120,23 +160,22 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader): ): 
"""Initialize with file path.""" - min_unstructured_version = "0.6.2" - if not satisfies_min_unstructured_version(min_unstructured_version): - raise ValueError( - "Partitioning via API is only supported in " - f"unstructured>={min_unstructured_version}." - ) + if isinstance(file_path, str): + validate_unstructured_version(min_unstructured_version="0.6.2") + else: + validate_unstructured_version(min_unstructured_version="0.6.3") self.url = url self.api_key = api_key super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) - def _get_elements(self) -> List: - from unstructured.partition.api import partition_via_api + def _get_metadata(self) -> dict: + return {"source": self.file_path} - return partition_via_api( - filename=self.file_path, + def _get_elements(self) -> List: + return get_elements_from_api( + file_path=self.file_path, api_key=self.api_key, api_url=self.url, **self.unstructured_kwargs, @@ -146,7 +185,12 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader): class UnstructuredFileIOLoader(UnstructuredBaseLoader): """Loader that uses unstructured to load file IO objects.""" - def __init__(self, file: IO, mode: str = "single", **unstructured_kwargs: Any): + def __init__( + self, + file: Union[IO, Sequence[IO]], + mode: str = "single", + **unstructured_kwargs: Any, + ): """Initialize with file path.""" self.file = file super().__init__(mode=mode, **unstructured_kwargs) @@ -165,7 +209,7 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader): def __init__( self, - file: IO, + file: Union[IO, Sequence[IO]], mode: str = "single", url: str = "https://api.unstructured.io/general/v0/general", api_key: str = "", @@ -173,21 +217,18 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader): ): """Initialize with file path.""" - min_unstructured_version = "0.6.2" - if not satisfies_min_unstructured_version(min_unstructured_version): - raise ValueError( - "Partitioning via API is only supported in " - 
f"unstructured>={min_unstructured_version}." - ) + if isinstance(file, collections.abc.Sequence): + validate_unstructured_version(min_unstructured_version="0.6.3") + if file: + validate_unstructured_version(min_unstructured_version="0.6.2") self.url = url self.api_key = api_key + super().__init__(file=file, mode=mode, **unstructured_kwargs) def _get_elements(self) -> List: - from unstructured.partition.api import partition_via_api - - return partition_via_api( + return get_elements_from_api( file=self.file, api_key=self.api_key, api_url=self.url, diff --git a/langchain/document_loaders/word_document.py b/langchain/document_loaders/word_document.py index 1cec1cce..f0272b2e 100644 --- a/langchain/document_loaders/word_document.py +++ b/langchain/document_loaders/word_document.py @@ -82,7 +82,7 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader): is_doc = detect_filetype(self.file_path) == FileType.DOC except ImportError: - _, extension = os.path.splitext(self.file_path) + _, extension = os.path.splitext(str(self.file_path)) is_doc = extension == ".doc" if is_doc and unstructured_version < (0, 4, 11): diff --git a/tests/integration_tests/document_loaders/test_unstructured.py b/tests/integration_tests/document_loaders/test_unstructured.py new file mode 100644 index 00000000..c86abb22 --- /dev/null +++ b/tests/integration_tests/document_loaders/test_unstructured.py @@ -0,0 +1,82 @@ +import os +from contextlib import ExitStack +from pathlib import Path + +from langchain.document_loaders import ( + UnstructuredAPIFileIOLoader, + UnstructuredAPIFileLoader, +) + +EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent / "examples/") + + +def test_unstructured_api_file_loader() -> None: + """Test unstructured loader.""" + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") + loader = UnstructuredAPIFileLoader( + file_path=file_path, + api_key="FAKE_API_KEY", + strategy="fast", + mode="elements", + ) + docs = loader.load() + + assert 
len(docs) > 1 + + +def test_unstructured_api_file_loader_multiple_files() -> None: + """Test unstructured loader.""" + file_paths = [ + os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf"), + os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt"), + ] + + loader = UnstructuredAPIFileLoader( + file_path=file_paths, + api_key="FAKE_API_KEY", + strategy="fast", + mode="elements", + ) + docs = loader.load() + + assert len(docs) > 1 + + +def test_unstructured_api_file_io_loader() -> None: + """Test unstructured loader.""" + file_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf") + + with open(file_path, "rb") as f: + loader = UnstructuredAPIFileIOLoader( + file=f, + api_key="FAKE_API_KEY", + strategy="fast", + mode="elements", + file_filename=file_path, + ) + docs = loader.load() + + assert len(docs) > 1 + + +def test_unstructured_api_file_loader_io_multiple_files() -> None: + """Test unstructured loader.""" + file_paths = [ + os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf"), + os.path.join(EXAMPLE_DOCS_DIRECTORY, "whatsapp_chat.txt"), + ] + + with ExitStack() as stack: + files = [stack.enter_context(open(file_path, "rb")) for file_path in file_paths] + + loader = UnstructuredAPIFileIOLoader( + file=files, # type: ignore + api_key="FAKE_API_KEY", + strategy="fast", + mode="elements", + file_filenames=file_paths, + ) + + docs = loader.load() + + assert len(docs) > 1