feat: add Unstructured API loaders (#3906)

### Summary

Adds `UnstructuredAPIFileLoader` and `UnstructuredAPIFileIOLoader`,
which partition documents through the Unstructured API. They default to the
URL of the hosted Unstructured API, but can be pointed at a self-hosted or
locally running API using the `url` kwarg. Currently, the Unstructured
API is open and does not require an API key, but it will soon. A note
about that was added to the Unstructured ecosystem page.

### Testing


```python
from langchain.document_loaders import UnstructuredAPIFileIOLoader

filename = "fake-email.eml"

with open(filename, "rb") as f:
    loader = UnstructuredAPIFileIOLoader(file=f, file_filename=filename)
    docs = loader.load()

docs[0]
```

```python
from langchain.document_loaders import UnstructuredAPIFileLoader

filename = "fake-email.eml"
loader = UnstructuredAPIFileLoader(file_path=filename, mode="elements")
docs = loader.load()

docs[0]
```
This commit is contained in:
Matt Robinson 2023-05-01 23:37:35 -04:00 committed by GitHub
parent 13269fb583
commit c51dec5101
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 88 additions and 0 deletions

View File

@ -10,6 +10,10 @@ This page is broken into two parts: installation and setup, and then references
`unstructured` wrappers. `unstructured` wrappers.
## Installation and Setup ## Installation and Setup
If you are using a loader that runs locally, use the following steps to get `unstructured` and
its dependencies running locally.
- Install the Python SDK with `pip install "unstructured[local-inference]"` - Install the Python SDK with `pip install "unstructured[local-inference]"`
- Install the following system dependencies if they are not already available on your system. - Install the following system dependencies if they are not already available on your system.
Depending on what document types you're parsing, you may not need all of these. Depending on what document types you're parsing, you may not need all of these.
@ -25,6 +29,15 @@ This page is broken into two parts: installation and setup, and then references
using the `"fast"` strategy, which uses `pdfminer` directly and doesn't require using the `"fast"` strategy, which uses `pdfminer` directly and doesn't require
`detectron2`. `detectron2`.
If you want to get up and running with less setup, you can
simply run `pip install unstructured` and use `UnstructuredAPIFileLoader` or
`UnstructuredAPIFileIOLoader`. That will process your document using the hosted Unstructured API.
Note that currently (as of 1 May 2023) the Unstructured API is open, but it will soon require
an API key. The [Unstructured documentation page](https://unstructured-io.github.io/) will have
instructions on how to generate an API key once they're available. Check out the instructions
[here](https://github.com/Unstructured-IO/unstructured-api#dizzy-instructions-for-using-the-docker-image)
if you'd like to self-host the Unstructured API or run it locally.
## Wrappers ## Wrappers
### Data Loaders ### Data Loaders

View File

@ -77,6 +77,8 @@ from langchain.document_loaders.telegram import TelegramChatLoader
from langchain.document_loaders.text import TextLoader from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.twitter import TwitterTweetLoader from langchain.document_loaders.twitter import TwitterTweetLoader
from langchain.document_loaders.unstructured import ( from langchain.document_loaders.unstructured import (
UnstructuredAPIFileIOLoader,
UnstructuredAPIFileLoader,
UnstructuredFileIOLoader, UnstructuredFileIOLoader,
UnstructuredFileLoader, UnstructuredFileLoader,
) )
@ -164,7 +166,9 @@ __all__ = [
"TwitterTweetLoader", "TwitterTweetLoader",
"UnstructuredEPubLoader", "UnstructuredEPubLoader",
"UnstructuredEmailLoader", "UnstructuredEmailLoader",
"UnstructuredAPIFileIOLoader",
"UnstructuredFileIOLoader", "UnstructuredFileIOLoader",
"UnstructuredAPIFileLoader",
"UnstructuredFileLoader", "UnstructuredFileLoader",
"UnstructuredHTMLLoader", "UnstructuredHTMLLoader",
"UnstructuredImageLoader", "UnstructuredImageLoader",

View File

@ -98,6 +98,42 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
return {"source": self.file_path} return {"source": self.file_path}
class UnstructuredAPIFileLoader(UnstructuredFileLoader):
    """Load a file by partitioning it through the Unstructured web API.

    ``_get_elements`` hands the file at ``file_path`` to
    ``unstructured.partition.api.partition_via_api`` together with the
    configured endpoint URL and API key.
    """

    def __init__(
        self,
        file_path: str,
        mode: str = "single",
        url: str = "https://api.unstructured.io/general/v0/general",
        api_key: str = "",
        **unstructured_kwargs: Any,
    ):
        """Store the API settings, then delegate to the parent loader.

        Args:
            file_path: Path of the file to partition; forwarded to the
                parent constructor.
            mode: Forwarded unchanged to the parent constructor.
            url: Endpoint passed to ``partition_via_api`` as ``api_url``.
            api_key: Key passed to ``partition_via_api`` as ``api_key``.
            **unstructured_kwargs: Extra keyword arguments forwarded to the
                parent constructor.

        Raises:
            ValueError: If ``satisfies_min_unstructured_version`` reports
                that the installed ``unstructured`` is older than 0.6.2.
        """
        required_version = "0.6.2"
        if not satisfies_min_unstructured_version(required_version):
            message = (
                "Partitioning via API is only supported in "
                f"unstructured>={required_version}."
            )
            raise ValueError(message)

        # The API settings are recorded before the parent initializer runs.
        self.url = url
        self.api_key = api_key
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Return the result of partitioning ``self.file_path`` via the API."""
        # The import stays local to this method, so it is only attempted
        # when elements are actually requested.
        from unstructured.partition.api import partition_via_api

        # NOTE(review): self.unstructured_kwargs is presumably set by the
        # parent constructor - confirm against the base class.
        elements = partition_via_api(
            filename=self.file_path,
            api_key=self.api_key,
            api_url=self.url,
            **self.unstructured_kwargs,
        )
        return elements
class UnstructuredFileIOLoader(UnstructuredBaseLoader): class UnstructuredFileIOLoader(UnstructuredBaseLoader):
"""Loader that uses unstructured to load file IO objects.""" """Loader that uses unstructured to load file IO objects."""
@ -113,3 +149,38 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
def _get_metadata(self) -> dict: def _get_metadata(self) -> dict:
return {} return {}
class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
    """Load a file IO object by partitioning it through the Unstructured web API.

    ``_get_elements`` hands ``self.file`` to
    ``unstructured.partition.api.partition_via_api`` together with the
    configured endpoint URL and API key.
    """

    def __init__(
        self,
        file: IO,
        mode: str = "single",
        url: str = "https://api.unstructured.io/general/v0/general",
        api_key: str = "",
        **unstructured_kwargs: Any,
    ):
        """Store the API settings, then delegate to the parent loader.

        Args:
            file: File IO object to partition; forwarded to the parent
                constructor.
            mode: Forwarded unchanged to the parent constructor.
            url: Endpoint passed to ``partition_via_api`` as ``api_url``.
            api_key: Key passed to ``partition_via_api`` as ``api_key``.
            **unstructured_kwargs: Extra keyword arguments forwarded to the
                parent constructor.

        Raises:
            ValueError: If ``satisfies_min_unstructured_version`` reports
                that the installed ``unstructured`` is older than 0.6.2.
        """
        required_version = "0.6.2"
        if not satisfies_min_unstructured_version(required_version):
            message = (
                "Partitioning via API is only supported in "
                f"unstructured>={required_version}."
            )
            raise ValueError(message)

        # The API settings are recorded before the parent initializer runs.
        self.url = url
        self.api_key = api_key
        super().__init__(file=file, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Return the result of partitioning ``self.file`` via the API."""
        # The import stays local to this method, so it is only attempted
        # when elements are actually requested.
        from unstructured.partition.api import partition_via_api

        # NOTE(review): self.file and self.unstructured_kwargs are presumably
        # set by the parent constructor - confirm against the base class.
        elements = partition_via_api(
            file=self.file,
            api_key=self.api_key,
            api_url=self.url,
            **self.unstructured_kwargs,
        )
        return elements