diff --git a/docs/ecosystem/unstructured.md b/docs/ecosystem/unstructured.md index 1133688a..6509c618 100644 --- a/docs/ecosystem/unstructured.md +++ b/docs/ecosystem/unstructured.md @@ -17,9 +17,12 @@ This page is broken into two parts: installation and setup, and then references - `poppler-utils` - `tesseract-ocr` - `libreoffice` -- If you are parsing PDFs, run the following to install the `detectron2` model, which +- If you are parsing PDFs using the `"hi_res"` strategy, run the following to install the `detectron2` model, which `unstructured` uses for layout detection: - `pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"` + - If `detectron2` is not installed, `unstructured` will fall back to processing PDFs + using the `"fast"` strategy, which uses `pdfminer` directly and doesn't require + `detectron2`. ## Wrappers diff --git a/docs/modules/document_loaders/examples/unstructured_file.ipynb b/docs/modules/document_loaders/examples/unstructured_file.ipynb index 6f173c7a..1db5f7ce 100644 --- a/docs/modules/document_loaders/examples/unstructured_file.ipynb +++ b/docs/modules/document_loaders/examples/unstructured_file.ipynb @@ -158,7 +158,72 @@ }, { "cell_type": "markdown", - "id": "7874d01d", + "id": "672733fd", + "metadata": {}, + "source": [ + "## Define a Partitioning Strategy\n", + "\n", + "Unstructured document loaders allow users to pass in a `strategy` parameter that lets `unstructured` know how to partition the document. Currently supported strategies are `\"hi_res\"` (the default) and `\"fast\"`. Hi res partitioning strategies are more accurate, but take longer to process. Fast strategies partition the document more quickly, but trade off accuracy. Not all document types have separate hi res and fast partitioning strategies. For those document types, the `strategy` kwarg is ignored. In some cases, the hi res strategy will fall back to fast if there is a dependency missing (i.e. 
a model for document partitioning). You can see how to apply a strategy to an `UnstructuredFileLoader` below." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "767238a4", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import UnstructuredFileLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9518b425", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredFileLoader(\"layout-parser-paper-fast.pdf\", strategy=\"fast\", mode=\"elements\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "645f29e9", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "60685353", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='1', lookup_str='', metadata={'source': 'layout-parser-paper-fast.pdf', 'filename': 'layout-parser-paper-fast.pdf', 'page_number': 1, 'category': 'UncategorizedText'}, lookup_index=0),\n", + " Document(page_content='2', lookup_str='', metadata={'source': 'layout-parser-paper-fast.pdf', 'filename': 'layout-parser-paper-fast.pdf', 'page_number': 1, 'category': 'UncategorizedText'}, lookup_index=0),\n", + " Document(page_content='0', lookup_str='', metadata={'source': 'layout-parser-paper-fast.pdf', 'filename': 'layout-parser-paper-fast.pdf', 'page_number': 1, 'category': 'UncategorizedText'}, lookup_index=0),\n", + " Document(page_content='2', lookup_str='', metadata={'source': 'layout-parser-paper-fast.pdf', 'filename': 'layout-parser-paper-fast.pdf', 'page_number': 1, 'category': 'UncategorizedText'}, lookup_index=0),\n", + " Document(page_content='n', lookup_str='', metadata={'source': 'layout-parser-paper-fast.pdf', 'filename': 'layout-parser-paper-fast.pdf', 'page_number': 1, 'category': 'Title'}, lookup_index=0)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "docs[:5]" + ] + }, + { + "cell_type": "markdown", + "id": "8de9ef16", "metadata": {}, "source": [ "## PDF Example\n", @@ -166,7 +231,6 @@ "Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of `elements`. " ] }, - { "cell_type": "code", "execution_count": 1, @@ -225,7 +289,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8ca8a648", + "id": "f52b04cb", "metadata": {}, "outputs": [], "source": [] @@ -247,7 +311,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/langchain/document_loaders/html.py b/langchain/document_loaders/html.py index 3d87f9fe..51784215 100644 --- a/langchain/document_loaders/html.py +++ b/langchain/document_loaders/html.py @@ -10,4 +10,4 @@ class UnstructuredHTMLLoader(UnstructuredFileLoader): def _get_elements(self) -> List: from unstructured.partition.html import partition_html - return partition_html(filename=self.file_path) + return partition_html(filename=self.file_path, **self.unstructured_kwargs) diff --git a/langchain/document_loaders/image.py b/langchain/document_loaders/image.py index 668b211c..9732495d 100644 --- a/langchain/document_loaders/image.py +++ b/langchain/document_loaders/image.py @@ -10,4 +10,4 @@ class UnstructuredImageLoader(UnstructuredFileLoader): def _get_elements(self) -> List: from unstructured.partition.image import partition_image - return partition_image(filename=self.file_path) + return partition_image(filename=self.file_path, **self.unstructured_kwargs) diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py index 0fb6edde..b7e9cd1c 100644 --- a/langchain/document_loaders/pdf.py +++ b/langchain/document_loaders/pdf.py @@ -18,7 +18,7 @@ class UnstructuredPDFLoader(UnstructuredFileLoader): def _get_elements(self) -> List: from unstructured.partition.pdf import partition_pdf - return 
def _release_tuple(version: str) -> tuple:
    """Return the leading numeric release of *version* as a tuple of ints.

    Only the dotted run of digits at the start of the string is used, so
    ``"0.4.17-dev1"``, ``"0.5.4.dev1"`` and ``"0.6.0rc1"`` yield ``(0, 4, 17)``,
    ``(0, 5, 4)`` and ``(0, 6, 0)`` respectively. Everything after that run
    (pre-release, dev or local markers) is ignored.

    Raises:
        ValueError: if *version* does not start with a digit.
    """
    import re

    match = re.match(r"\d+(?:\.\d+)*", version.strip())
    if match is None:
        raise ValueError(f"Cannot parse a numeric release from version {version!r}")
    return tuple(int(part) for part in match.group(0).split("."))


def satisfies_min_unstructured_version(min_version: str) -> bool:
    """Check whether the installed unstructured version meets a minimum version.

    Args:
        min_version: Dotted numeric version string (e.g. ``"0.5.4"``) required
            for the feature in question.

    Returns:
        True when the installed release is greater than or equal to
        *min_version*, False otherwise.

    Raises:
        ValueError: if either version string has no leading numeric release.
    """
    from unstructured.__version__ import __version__ as __unstructured_version__

    # NOTE(MthwRobinson) - enables the loader to work when you're using pre-release
    # versions of unstructured like 0.4.17-dev1. Only the numeric release part is
    # compared, so suffixes such as ".dev1" or "rc1" no longer break int() parsing.
    installed = _release_tuple(__unstructured_version__)
    required = _release_tuple(min_version)

    # Zero-pad to a common length so that "0.6" and "0.6.0" compare as equal
    # instead of the shorter tuple being treated as older.
    width = max(len(installed), len(required))
    installed += (0,) * (width - len(installed))
    required += (0,) * (width - len(required))

    return installed >= required
**self.unstructured_kwargs) def _get_metadata(self) -> dict: return {} diff --git a/langchain/document_loaders/word_document.py b/langchain/document_loaders/word_document.py index 139f4d30..dd990942 100644 --- a/langchain/document_loaders/word_document.py +++ b/langchain/document_loaders/word_document.py @@ -36,8 +36,8 @@ class UnstructuredWordDocumentLoader(UnstructuredFileLoader): if is_doc: from unstructured.partition.doc import partition_doc - return partition_doc(filename=self.file_path) + return partition_doc(filename=self.file_path, **self.unstructured_kwargs) else: from unstructured.partition.docx import partition_docx - return partition_docx(filename=self.file_path) + return partition_docx(filename=self.file_path, **self.unstructured_kwargs)