docs: update unstructured docstrings (#7561)

### Summary

Updates the docstrings in the Unstructured document loaders to display
more useful information on the integrations page.
This commit is contained in:
Matt Robinson 2023-07-11 17:12:05 -04:00 committed by GitHub
parent 2c2f0e15a6
commit c4d53f98dc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 278 additions and 21 deletions

View File

@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (
class UnstructuredEPubLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load epub files."""
"""UnstructuredEPubLoader uses unstructured to load EPUB files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredEPubLoader
loader = UnstructuredEPubLoader(
"example.epub", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-epub
"""
def _get_elements(self) -> List:
min_unstructured_version = "0.5.4"

View File

@ -5,7 +5,27 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class UnstructuredHTMLLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load HTML files."""
"""UnstructuredHTMLLoader uses unstructured to load HTML files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredHTMLLoader
loader = UnstructuredHTMLLoader(
"example.html", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-html
"""
def _get_elements(self) -> List:
from unstructured.partition.html import partition_html

View File

@ -5,7 +5,27 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class UnstructuredImageLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load image files, such as PNGs and JPGs."""
"""UnstructuredImageLoader uses unstructured to load PNG and JPG files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredImageLoader
loader = UnstructuredImageLoader(
"example.png", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-image
"""
def _get_elements(self) -> List:
from unstructured.partition.image import partition_image

View File

@ -5,7 +5,27 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class UnstructuredMarkdownLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load markdown files."""
"""UnstructuredMarkdownLoader uses unstructured to load markdown files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredMarkdownLoader
loader = UnstructuredMarkdownLoader(
"example.md", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-md
"""
def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__

View File

@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (
class UnstructuredODTLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load OpenOffice ODT files."""
"""Loader that uses unstructured to load OpenOffice ODT files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredODTLoader
loader = UnstructuredODTLoader(
"example.odt", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-odt
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any

View File

@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (
class UnstructuredOrgModeLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load Org-Mode files."""
"""Loader that uses unstructured to load Org-Mode files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredOrgModeLoader
loader = UnstructuredOrgModeLoader(
"example.org", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-org
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any

View File

@ -29,7 +29,27 @@ logger = logging.getLogger(__file__)
class UnstructuredPDFLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load PDF files."""
"""Loader that uses unstructured to load PDF files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredPDFLoader
loader = UnstructuredPDFLoader(
"example.pdf", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
"""
def _get_elements(self) -> List:
from unstructured.partition.pdf import partition_pdf

View File

@ -6,7 +6,28 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load PowerPoint files."""
"""Loader that uses unstructured to load PowerPoint files.
Works with both .ppt and .pptx files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredPowerPointLoader
loader = UnstructuredPowerPointLoader(
"example.pptx", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-pptx
"""
def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__

View File

@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (
class UnstructuredRSTLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load RST files."""
"""Loader that uses unstructured to load RST files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredRSTLoader
loader = UnstructuredRSTLoader(
"example.rst", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-rst
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any

View File

@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (
class UnstructuredRTFLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load rtf files."""
"""Loader that uses unstructured to load RTF files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredRTFLoader
loader = UnstructuredRTFLoader(
"example.rtf", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-rtf
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any

View File

@ -126,14 +126,12 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):
Examples
--------
```python
from langchain.document_loaders import UnstructuredFileLoader
loader = UnstructuredFileLoader(
"example.pdf", mode="elements", strategy="fast",
)
docs = loader.load()
```
References
----------
@ -211,7 +209,6 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
different unstructured settings.
Examples
--------
```python
from langchain.document_loaders import UnstructuredAPIFileLoader
@ -219,7 +216,6 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
"example.pdf", mode="elements", strategy="fast", api_key="MY_API_KEY",
)
docs = loader.load()
```
References
----------
@ -272,7 +268,6 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
Examples
--------
```python
from langchain.document_loaders import UnstructuredFileIOLoader
with open("example.pdf", "rb") as f:
@ -280,7 +275,6 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
f, mode="elements", strategy="fast",
)
docs = loader.load()
```
References
@ -324,7 +318,6 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
Examples
--------
```python
from langchain.document_loaders import UnstructuredAPIFileIOLoader
with open("example.pdf", "rb") as f:
@ -332,7 +325,6 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
f, mode="elements", strategy="fast", api_key="MY_API_KEY",
)
docs = loader.load()
```
References
----------

View File

@ -9,7 +9,30 @@ logger = logging.getLogger(__name__)
class UnstructuredURLLoader(BaseLoader):
"""Loader that uses unstructured to load HTML files."""
"""Loader that uses Unstructured to load files from remote URLs.
Use the unstructured partition function to detect the MIME type
and route the file to the appropriate partitioner.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredURLLoader
loader = UnstructuredURLLoader(
urls=["<url-1>", "<url-2>"], mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition
"""
def __init__(
self,

View File

@ -65,7 +65,28 @@ class Docx2txtLoader(BaseLoader, ABC):
class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load word documents."""
"""Loader that uses unstructured to load word documents.
Works with both .docx and .doc files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredWordDocumentLoader
loader = UnstructuredWordDocumentLoader(
"example.docx", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-docx
"""
def _get_elements(self) -> List:
from unstructured.__version__ import __version__ as __unstructured_version__

View File

@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (
class UnstructuredXMLLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load XML files."""
"""Loader that uses unstructured to load XML files.
You can run the loader in one of two modes: "single" and "elements".
If you use "single" mode, the document will be returned as a single
langchain Document object. If you use "elements" mode, the unstructured
library will split the document into elements such as Title and NarrativeText.
You can pass in additional unstructured kwargs after mode to apply
different unstructured settings.
Examples
--------
from langchain.document_loaders import UnstructuredXMLLoader
loader = UnstructuredXMLLoader(
"example.xml", mode="elements", strategy="fast",
)
docs = loader.load()
References
----------
https://unstructured-io.github.io/unstructured/bricks.html#partition-xml
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any