mirror of https://github.com/hwchase17/langchain
feat: batch multiple files in a single Unstructured API request (#4525)
### Submit Multiple Files to the Unstructured API Enables batching multiple files into a single Unstructured API request. Support for requests with multiple files was added to both `UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader`. Note that if you submit multiple files in "single" mode, the result will be concatenated into a single document. We recommend using this feature in "elements" mode. ### Testing The following should load both documents, using two of the example docs from the integration tests folder. ```python from langchain.document_loaders import UnstructuredAPIFileLoader file_paths = ["examples/layout-parser-paper.pdf", "examples/whatsapp_chat.txt"] loader = UnstructuredAPIFileLoader( file_path=file_paths, api_key="FAKE_API_KEY", strategy="fast", mode="elements", ) docs = loader.load() ```pull/4881/head^2
parent
0c3de0a0b3
commit
bf3f554357
@ -0,0 +1,82 @@
|
||||
import os
|
||||
from contextlib import ExitStack
|
||||
from pathlib import Path
|
||||
|
||||
from langchain.document_loaders import (
|
||||
UnstructuredAPIFileIOLoader,
|
||||
UnstructuredAPIFileLoader,
|
||||
)
|
||||
|
||||
# String path of the "examples" folder that sits one level above the directory
# containing this test module; the tests below join sample file names onto it.
EXAMPLE_DOCS_DIRECTORY = str(Path(__file__).parent.parent.joinpath("examples/"))
|
||||
|
||||
|
||||
def test_unstructured_api_file_loader() -> None:
    """Load one PDF by path with ``UnstructuredAPIFileLoader``.

    The loader is configured for "elements" mode with the "fast" strategy,
    and the test passes when ``load()`` returns more than one document.
    """
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    documents = UnstructuredAPIFileLoader(
        file_path=pdf_path,
        api_key="FAKE_API_KEY",
        strategy="fast",
        mode="elements",
    ).load()

    assert len(documents) > 1
|
||||
|
||||
|
||||
def test_unstructured_api_file_loader_multiple_files() -> None:
    """Load two example files at once with ``UnstructuredAPIFileLoader``.

    A list of paths (a PDF and a text chat export) is passed as ``file_path``.
    The loader runs in "elements" mode, and the test passes when ``load()``
    returns more than one document.
    """
    sample_names = ("layout-parser-paper.pdf", "whatsapp_chat.txt")
    sample_paths = [
        os.path.join(EXAMPLE_DOCS_DIRECTORY, name) for name in sample_names
    ]

    documents = UnstructuredAPIFileLoader(
        file_path=sample_paths,
        api_key="FAKE_API_KEY",
        strategy="fast",
        mode="elements",
    ).load()

    assert len(documents) > 1
|
||||
|
||||
|
||||
def test_unstructured_api_file_io_loader() -> None:
    """Load one PDF from an open binary handle with ``UnstructuredAPIFileIOLoader``.

    The path is also passed as ``file_filename`` alongside the file object.
    ``load()`` is called while the handle is still open, and the test passes
    when it returns more than one document.
    """
    pdf_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper.pdf")

    with open(pdf_path, "rb") as pdf_handle:
        # Load inside the ``with`` block so the handle is still open when read.
        documents = UnstructuredAPIFileIOLoader(
            file=pdf_handle,
            api_key="FAKE_API_KEY",
            strategy="fast",
            mode="elements",
            file_filename=pdf_path,
        ).load()

    assert len(documents) > 1
|
||||
|
||||
|
||||
def test_unstructured_api_file_loader_io_multiple_files() -> None:
    """Load two open file handles at once with ``UnstructuredAPIFileIOLoader``.

    Both example files are opened in binary mode under one ``ExitStack``, so
    every handle is closed when the block exits. The handles are passed as
    ``file`` and the matching paths as ``file_filenames``. The test passes
    when ``load()`` returns more than one document.
    """
    sample_names = ("layout-parser-paper.pdf", "whatsapp_chat.txt")
    sample_paths = [
        os.path.join(EXAMPLE_DOCS_DIRECTORY, name) for name in sample_names
    ]

    with ExitStack() as open_files:
        # Register each handle with the stack so all of them are closed together.
        handles = [
            open_files.enter_context(open(path, "rb")) for path in sample_paths
        ]

        documents = UnstructuredAPIFileIOLoader(
            file=handles,  # type: ignore
            api_key="FAKE_API_KEY",
            strategy="fast",
            mode="elements",
            file_filenames=sample_paths,
        ).load()

    assert len(documents) > 1
|
Loading…
Reference in New Issue