docs: update unstructured docstrings (#7561)

### Summary

Updates the docstrings in the Unstructured document loaders to display more useful information on the integrations page.
1 year ago · c4d53f98dc
parent 2c2f0e15a6
commit c4d53f98dc
14 changed files with 278 additions and 21 deletions
--- a/langchain/document_loaders/epub.py
+++ b/langchain/document_loaders/epub.py
@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (


 class UnstructuredEPubLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load epub files."""
+    """UnstructuredEPubLoader uses unstructured to load EPUB files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredEPubLoader
+
+    loader = UnstructuredEPubLoader(
+        "example.epub", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-epub
+    """

    def _get_elements(self) -> List:
        min_unstructured_version = "0.5.4"
--- a/langchain/document_loaders/html.py
+++ b/langchain/document_loaders/html.py
@ -5,7 +5,27 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader


 class UnstructuredHTMLLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load HTML files."""
+    """UnstructuredHTMLLoader uses unstructured to load HTML files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredHTMLLoader
+
+    loader = UnstructuredHTMLLoader(
+        "example.html", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-html
+    """

    def _get_elements(self) -> List:
        from unstructured.partition.html import partition_html
--- a/langchain/document_loaders/image.py
+++ b/langchain/document_loaders/image.py
@ -5,7 +5,27 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader


 class UnstructuredImageLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load image files, such as PNGs and JPGs."""
+    """UnstructuredImageLoader uses unstructured to load PNG and JPG files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredImageLoader
+
+    loader = UnstructuredImageLoader(
+        "example.png", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-image
+    """

    def _get_elements(self) -> List:
        from unstructured.partition.image import partition_image
--- a/langchain/document_loaders/markdown.py
+++ b/langchain/document_loaders/markdown.py
@ -5,7 +5,27 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader


 class UnstructuredMarkdownLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load markdown files."""
+    """UnstructuredMarkdownLoader uses unstructured to load markdown files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredMarkdownLoader
+
+    loader = UnstructuredMarkdownLoader(
+        "example.md", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-md
+    """

    def _get_elements(self) -> List:
        from unstructured.__version__ import __version__ as __unstructured_version__
--- a/langchain/document_loaders/odt.py
+++ b/langchain/document_loaders/odt.py
@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (


 class UnstructuredODTLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load OpenOffice ODT files."""
+    """Loader that uses unstructured to load OpenOffice ODT files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredODTLoader
+
+    loader = UnstructuredODTLoader(
+        "example.odt", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-odt
+    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
--- a/langchain/document_loaders/org_mode.py
+++ b/langchain/document_loaders/org_mode.py
@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (


 class UnstructuredOrgModeLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load Org-Mode files."""
+    """Loader that uses unstructured to load Org-Mode files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredOrgModeLoader
+
+    loader = UnstructuredOrgModeLoader(
+        "example.org", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-org
+    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@ -29,7 +29,27 @@ logger = logging.getLogger(__file__)


 class UnstructuredPDFLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load PDF files."""
+    """Loader that uses unstructured to load PDF files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredPDFLoader
+
+    loader = UnstructuredPDFLoader(
+        "example.pdf", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf
+    """

    def _get_elements(self) -> List:
        from unstructured.partition.pdf import partition_pdf
--- a/langchain/document_loaders/powerpoint.py
+++ b/langchain/document_loaders/powerpoint.py
@ -6,7 +6,28 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader


 class UnstructuredPowerPointLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load PowerPoint files."""
+    """Loader that uses unstructured to load PowerPoint files.
+    Works with both .ppt and .pptx files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredPowerPointLoader
+
+    loader = UnstructuredPowerPointLoader(
+        "example.pptx", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-pptx
+    """

    def _get_elements(self) -> List:
        from unstructured.__version__ import __version__ as __unstructured_version__
--- a/langchain/document_loaders/rst.py
+++ b/langchain/document_loaders/rst.py
@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (


 class UnstructuredRSTLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load RST files."""
+    """Loader that uses unstructured to load RST files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredRSTLoader
+
+    loader = UnstructuredRSTLoader(
+        "example.rst", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-rst
+    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
--- a/langchain/document_loaders/rtf.py
+++ b/langchain/document_loaders/rtf.py
@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (


 class UnstructuredRTFLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load rtf files."""
+    """Loader that uses unstructured to load RTF files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredRTFLoader
+
+    loader = UnstructuredRTFLoader(
+        "example.rtf", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-rtf
+    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
--- a/langchain/document_loaders/unstructured.py
+++ b/langchain/document_loaders/unstructured.py
@ -126,14 +126,12 @@ class UnstructuredFileLoader(UnstructuredBaseLoader):

    Examples
    --------
-    ```python
    from langchain.document_loaders import UnstructuredFileLoader

    loader = UnstructuredFileLoader(
        "example.pdf", mode="elements", strategy="fast",
    )
    docs = loader.load()
-    ```

    References
    ----------
@ -211,7 +209,6 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
    different unstructured settings.

    Examples
    --------
-    ```python
    from langchain.document_loaders import UnstructuredAPIFileLoader

@ -219,7 +216,6 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader):
        "example.pdf", mode="elements", strategy="fast", api_key="MY_API_KEY",
    )
    docs = loader.load()
-    ```

    References
    ----------
@ -272,7 +268,6 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):

    Examples
    --------
-    ```python
    from langchain.document_loaders import UnstructuredFileIOLoader

    with open("example.pdf", "rb") as f:
@ -280,7 +275,6 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader):
            f, mode="elements", strategy="fast",
        )
        docs = loader.load()
-    ```


    References
@ -324,7 +318,6 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):

    Examples
    --------
-    ```python
    from langchain.document_loaders import UnstructuredAPIFileIOLoader

    with open("example.pdf", "rb") as f:
@ -332,7 +325,6 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader):
            f, mode="elements", strategy="fast", api_key="MY_API_KEY",
        )
        docs = loader.load()
-    ```

    References
    ----------
--- a/langchain/document_loaders/url.py
+++ b/langchain/document_loaders/url.py
@ -9,7 +9,30 @@ logger = logging.getLogger(__name__)


 class UnstructuredURLLoader(BaseLoader):
-    """Loader that uses unstructured to load HTML files."""
+    """Loader that uses unstructured to load files from remote URLs.
+    Uses the unstructured partition function to detect the MIME type
+    and route the file to the appropriate partitioner.
+
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredURLLoader
+
+    loader = UnstructuredURLLoader(
+        urls=["<url-1>", "<url-2>"], mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition
+    """

    def __init__(
        self,
--- a/langchain/document_loaders/word_document.py
+++ b/langchain/document_loaders/word_document.py
@ -65,7 +65,28 @@ class Docx2txtLoader(BaseLoader, ABC):


 class UnstructuredWordDocumentLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load word documents."""
+    """Loader that uses unstructured to load word documents.
+    Works with both .docx and .doc files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredWordDocumentLoader
+
+    loader = UnstructuredWordDocumentLoader(
+        "example.docx", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-docx
+    """

    def _get_elements(self) -> List:
        from unstructured.__version__ import __version__ as __unstructured_version__
--- a/langchain/document_loaders/xml.py
+++ b/langchain/document_loaders/xml.py
@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import (


 class UnstructuredXMLLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load XML files."""
+    """Loader that uses unstructured to load XML files.
+    You can run the loader in one of two modes: "single" and "elements".
+    If you use "single" mode, the document will be returned as a single
+    langchain Document object. If you use "elements" mode, the unstructured
+    library will split the document into elements such as Title and NarrativeText.
+    You can pass in additional unstructured kwargs after mode to apply
+    different unstructured settings.
+
+    Examples
+    --------
+    from langchain.document_loaders import UnstructuredXMLLoader
+
+    loader = UnstructuredXMLLoader(
+        "example.xml", mode="elements", strategy="fast",
+    )
+    docs = loader.load()
+
+    References
+    ----------
+    https://unstructured-io.github.io/unstructured/bricks.html#partition-xml
+    """

    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any