Harrison/unstructured page number (#6464)

Co-authored-by: Reza Sanaie <reza@sanaie.ca>
11 months ago · 9eec7c3206
parent b82ddf9cfb
commit 9eec7c3206
3 changed files with 51 additions and 4 deletions
--- a/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb
@@ -226,13 +226,17 @@
   ]
  },
  {
+   "attachments": {},
   "cell_type": "markdown",
   "id": "8de9ef16",
   "metadata": {},
   "source": [
    "## PDF Example\n",
    "\n",
-    "Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of `elements`. "
+    "Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of elements. The modes of operation are:\n",
+    "- `single`: the text from all elements is combined into one document (default)\n",
+    "- `elements`: each element is kept as an individual document\n",
+    "- `paged`: the text of the elements on each page is combined into one document per page"
   ]
  },
  {
--- a/langchain/document_loaders/unstructured.py
+++ b/langchain/document_loaders/unstructured.py
@@ -1,7 +1,7 @@
 """Loader that uses unstructured to load files."""
 import collections
 from abc import ABC, abstractmethod
-from typing import IO, Any, List, Sequence, Union
+from typing import IO, Any, Dict, List, Sequence, Union

 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
@@ -45,7 +45,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
-        _valid_modes = {"single", "elements"}
+        _valid_modes = {"single", "elements", "paged"}
        if mode not in _valid_modes:
            raise ValueError(
                f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
@@ -80,6 +80,31 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
                if hasattr(element, "category"):
                    metadata["category"] = element.category
                docs.append(Document(page_content=str(element), metadata=metadata))
+        elif self.mode == "paged":
+            text_dict: Dict[int, str] = {}
+            meta_dict: Dict[int, Dict] = {}
+
+            for idx, element in enumerate(elements):
+                metadata = self._get_metadata()
+                if hasattr(element, "metadata"):
+                    metadata.update(element.metadata.to_dict())
+                page_number = metadata.get("page_number", 1)
+
+                # Check if this page_number already exists in text_dict
+                if page_number not in text_dict:
+                    # If not, create new entry with initial text and metadata
+                    text_dict[page_number] = str(element) + "\n\n"
+                    meta_dict[page_number] = metadata
+                else:
+                    # If exists, append to text and update the metadata
+                    text_dict[page_number] += str(element) + "\n\n"
+                    meta_dict[page_number].update(metadata)
+
+            # Convert the dict to a list of Document objects
+            docs = [
+                Document(page_content=text_dict[key], metadata=meta_dict[key])
+                for key in text_dict.keys()
+            ]
        elif self.mode == "single":
            metadata = self._get_metadata()
            text = "\n\n".join([str(el) for el in elements])
--- a/tests/integration_tests/document_loaders/test_pdf.py
+++ b/tests/integration_tests/document_loaders/test_pdf.py
@@ -11,7 +11,25 @@ from langchain.document_loaders import (
 )


-def test_unstructured_pdf_loader() -> None:
+def test_unstructured_pdf_loader_elements_mode() -> None:
+    """Test unstructured loader in elements mode."""
+    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
+    loader = UnstructuredPDFLoader(str(file_path), mode="elements")
+    docs = loader.load()
+
+    assert len(docs) == 2
+
+
+def test_unstructured_pdf_loader_paged_mode() -> None:
+    """Test unstructured loader in paged mode."""
+    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
+    loader = UnstructuredPDFLoader(str(file_path), mode="paged")
+    docs = loader.load()
+
+    assert len(docs) == 16
+
+
+def test_unstructured_pdf_loader_default_mode() -> None:
    """Test unstructured loader."""
    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
    loader = UnstructuredPDFLoader(str(file_path))