diff --git a/langchain/document_loaders/epub.py b/langchain/document_loaders/epub.py index f99f0a9d7b..05dfaea103 100644 --- a/langchain/document_loaders/epub.py +++ b/langchain/document_loaders/epub.py @@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import ( class UnstructuredEPubLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load epub files.""" + """UnstructuredEPubLoader uses unstructured to load EPUB files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. + + Examples + -------- + from langchain.document_loaders import UnstructuredEPubLoader + + loader = UnstructuredEPubLoader( + "example.epub", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-epub + """ def _get_elements(self) -> List: min_unstructured_version = "0.5.4" diff --git a/langchain/document_loaders/html.py b/langchain/document_loaders/html.py index 517842159e..940d879960 100644 --- a/langchain/document_loaders/html.py +++ b/langchain/document_loaders/html.py @@ -5,7 +5,27 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class UnstructuredHTMLLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load HTML files.""" + """UnstructuredHTMLLoader uses unstructured to load HTML files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. 
If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. + + Examples + -------- + from langchain.document_loaders import UnstructuredHTMLLoader + + loader = UnstructuredHTMLLoader( + "example.html", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-html + """ def _get_elements(self) -> List: from unstructured.partition.html import partition_html diff --git a/langchain/document_loaders/image.py b/langchain/document_loaders/image.py index 6954d04dea..b47339a228 100644 --- a/langchain/document_loaders/image.py +++ b/langchain/document_loaders/image.py @@ -5,7 +5,27 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class UnstructuredImageLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load image files, such as PNGs and JPGs.""" + """UnstructuredImageLoader uses unstructured to load PNG and JPG files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. 
+ + Examples + -------- + from langchain.document_loaders import UnstructuredImageLoader + + loader = UnstructuredImageLoader( + "example.png", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-image + """ def _get_elements(self) -> List: from unstructured.partition.image import partition_image diff --git a/langchain/document_loaders/markdown.py b/langchain/document_loaders/markdown.py index 3ecad43905..b091208287 100644 --- a/langchain/document_loaders/markdown.py +++ b/langchain/document_loaders/markdown.py @@ -5,7 +5,27 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class UnstructuredMarkdownLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load markdown files.""" + """UnstructuredMarkdownLoader uses unstructured to load markdown files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. 
+ + Examples + -------- + from langchain.document_loaders import UnstructuredMarkdownLoader + + loader = UnstructuredMarkdownLoader( + "example.md", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-md + """ def _get_elements(self) -> List: from unstructured.__version__ import __version__ as __unstructured_version__ diff --git a/langchain/document_loaders/odt.py b/langchain/document_loaders/odt.py index 63685e10f1..a8641b6562 100644 --- a/langchain/document_loaders/odt.py +++ b/langchain/document_loaders/odt.py @@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import ( class UnstructuredODTLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load OpenOffice ODT files.""" + """Loader that uses unstructured to load OpenOffice ODT files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. 
+ + Examples + -------- + from langchain.document_loaders import UnstructuredODTLoader + + loader = UnstructuredODTLoader( + "example.odt", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-odt + """ def __init__( self, file_path: str, mode: str = "single", **unstructured_kwargs: Any diff --git a/langchain/document_loaders/org_mode.py b/langchain/document_loaders/org_mode.py index 022cfca02b..dbb38411fd 100644 --- a/langchain/document_loaders/org_mode.py +++ b/langchain/document_loaders/org_mode.py @@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import ( class UnstructuredOrgModeLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load Org-Mode files.""" + """Loader that uses unstructured to load Org-Mode files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. 
+ + Examples + -------- + from langchain.document_loaders import UnstructuredOrgModeLoader + + loader = UnstructuredOrgModeLoader( + "example.org", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-org + """ def __init__( self, file_path: str, mode: str = "single", **unstructured_kwargs: Any diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py index f8a5a3bcc9..5e5bacd578 100644 --- a/langchain/document_loaders/pdf.py +++ b/langchain/document_loaders/pdf.py @@ -29,7 +29,27 @@ logger = logging.getLogger(__file__) class UnstructuredPDFLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load PDF files.""" + """Loader that uses unstructured to load PDF files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. 
+ + Examples + -------- + from langchain.document_loaders import UnstructuredPDFLoader + + loader = UnstructuredPDFLoader( + "example.pdf", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-pdf + """ def _get_elements(self) -> List: from unstructured.partition.pdf import partition_pdf diff --git a/langchain/document_loaders/powerpoint.py b/langchain/document_loaders/powerpoint.py index 7a00501e67..d9c3e7f84d 100644 --- a/langchain/document_loaders/powerpoint.py +++ b/langchain/document_loaders/powerpoint.py @@ -6,7 +6,28 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class UnstructuredPowerPointLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load PowerPoint files.""" + """Loader that uses unstructured to load PowerPoint files. + Works with both .ppt and .pptx files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. 
+ + Examples + -------- + from langchain.document_loaders import UnstructuredPowerPointLoader + + loader = UnstructuredPowerPointLoader( + "example.pptx", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-pptx + """ def _get_elements(self) -> List: from unstructured.__version__ import __version__ as __unstructured_version__ diff --git a/langchain/document_loaders/rst.py b/langchain/document_loaders/rst.py index abff302227..a0c0095ec5 100644 --- a/langchain/document_loaders/rst.py +++ b/langchain/document_loaders/rst.py @@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import ( class UnstructuredRSTLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load RST files.""" + """Loader that uses unstructured to load RST files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. 
+ + Examples + -------- + from langchain.document_loaders import UnstructuredRSTLoader + + loader = UnstructuredRSTLoader( + "example.rst", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-rst + """ def __init__( self, file_path: str, mode: str = "single", **unstructured_kwargs: Any diff --git a/langchain/document_loaders/rtf.py b/langchain/document_loaders/rtf.py index 3536cd3791..1cc7b4674d 100644 --- a/langchain/document_loaders/rtf.py +++ b/langchain/document_loaders/rtf.py @@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import ( class UnstructuredRTFLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load rtf files.""" + """Loader that uses unstructured to load RTF files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. 
+ + Examples + -------- + from langchain.document_loaders import UnstructuredRTFLoader + + loader = UnstructuredRTFLoader( + "example.rtf", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-rtf + """ def __init__( self, file_path: str, mode: str = "single", **unstructured_kwargs: Any diff --git a/langchain/document_loaders/unstructured.py b/langchain/document_loaders/unstructured.py index 4cedf84d04..add99f143b 100644 --- a/langchain/document_loaders/unstructured.py +++ b/langchain/document_loaders/unstructured.py @@ -126,14 +126,12 @@ class UnstructuredFileLoader(UnstructuredBaseLoader): Examples -------- - ```python from langchain.document_loaders import UnstructuredFileLoader loader = UnstructuredFileLoader( "example.pdf", mode="elements", strategy="fast", ) docs = loader.load() - ``` References ---------- @@ -211,7 +209,6 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader): different unstructured settings. 
 Examples -------- - ```python from langchain.document_loaders import UnstructuredAPIFileLoader @@ -219,7 +216,6 @@ class UnstructuredAPIFileLoader(UnstructuredFileLoader): "example.pdf", mode="elements", strategy="fast", api_key="MY_API_KEY", ) docs = loader.load() - ``` References ---------- @@ -272,7 +268,6 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader): Examples -------- - ```python from langchain.document_loaders import UnstructuredFileIOLoader with open("example.pdf", "rb") as f: @@ -280,7 +275,6 @@ class UnstructuredFileIOLoader(UnstructuredBaseLoader): f, mode="elements", strategy="fast", ) docs = loader.load() - ``` References @@ -324,7 +318,6 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader): Examples -------- - ```python from langchain.document_loaders import UnstructuredAPIFileLoader with open("example.pdf", "rb") as f: @@ -332,7 +325,6 @@ class UnstructuredAPIFileIOLoader(UnstructuredFileIOLoader): f, mode="elements", strategy="fast", api_key="MY_API_KEY", ) docs = loader.load() - ``` References ---------- diff --git a/langchain/document_loaders/url.py b/langchain/document_loaders/url.py index 3007fe9954..de985d4741 100644 --- a/langchain/document_loaders/url.py +++ b/langchain/document_loaders/url.py @@ -9,7 +9,30 @@ logger = logging.getLogger(__name__) class UnstructuredURLLoader(BaseLoader): - """Loader that uses unstructured to load HTML files.""" + """Loader that uses Unstructured to load files from remote URLs. + Use the unstructured partition function to detect the MIME type + and route the file to the appropriate partitioner. + + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. 
+ + Examples + -------- + from langchain.document_loaders import UnstructuredURLLoader + + loader = UnstructuredURLLoader( + urls=["", ""], mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition + """ def __init__( self, diff --git a/langchain/document_loaders/word_document.py b/langchain/document_loaders/word_document.py index 3ee91ab741..66ccfbd090 100644 --- a/langchain/document_loaders/word_document.py +++ b/langchain/document_loaders/word_document.py @@ -65,7 +65,28 @@ class Docx2txtLoader(BaseLoader, ABC): class UnstructuredWordDocumentLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load word documents.""" + """Loader that uses unstructured to load word documents. + Works with both .docx and .doc files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. 
+ + Examples + -------- + from langchain.document_loaders import UnstructuredWordDocumentLoader + + loader = UnstructuredWordDocumentLoader( + "example.docx", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-docx + """ def _get_elements(self) -> List: from unstructured.__version__ import __version__ as __unstructured_version__ diff --git a/langchain/document_loaders/xml.py b/langchain/document_loaders/xml.py index 4239a49396..ac6ceeea9a 100644 --- a/langchain/document_loaders/xml.py +++ b/langchain/document_loaders/xml.py @@ -8,7 +8,27 @@ from langchain.document_loaders.unstructured import ( class UnstructuredXMLLoader(UnstructuredFileLoader): - """Loader that uses unstructured to load XML files.""" + """Loader that uses unstructured to load XML files. + You can run the loader in one of two modes: "single" and "elements". + If you use "single" mode, the document will be returned as a single + langchain Document object. If you use "elements" mode, the unstructured + library will split the document into elements such as Title and NarrativeText. + You can pass in additional unstructured kwargs after mode to apply + different unstructured settings. + + Examples + -------- + from langchain.document_loaders import UnstructuredXMLLoader + + loader = UnstructuredXMLLoader( + "example.xml", mode="elements", strategy="fast", + ) + docs = loader.load() + + References + ---------- + https://unstructured-io.github.io/unstructured/bricks.html#partition-xml + """ def __init__( self, file_path: str, mode: str = "single", **unstructured_kwargs: Any