diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb index 33e4bd91dc..5a56752d07 100644 --- a/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb +++ b/docs/extras/modules/data_connection/document_loaders/integrations/unstructured_file.ipynb @@ -226,13 +226,17 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8de9ef16", "metadata": {}, "source": [ "## PDF Example\n", "\n", - "Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of `elements`. " + "Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of elements. The modes of operation are:\n", + "- `single`: the text from all elements is combined into one document (default)\n", + "- `elements`: each element is kept as an individual document\n", + "- `paged`: the text of all elements on the same page is combined into one document per page" ] }, { diff --git a/langchain/document_loaders/unstructured.py b/langchain/document_loaders/unstructured.py index 3e7e599d67..36ac8fc876 100644 --- a/langchain/document_loaders/unstructured.py +++ b/langchain/document_loaders/unstructured.py @@ -1,7 +1,7 @@ """Loader that uses unstructured to load files.""" import collections from abc import ABC, abstractmethod -from typing import IO, Any, List, Sequence, Union +from typing import IO, Any, Dict, List, Sequence, Union from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader @@ -45,7 +45,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC): "unstructured package not found, please install it with " "`pip install unstructured`" ) - _valid_modes = {"single", "elements"} + _valid_modes = {"single", "elements", "paged"} if mode not in _valid_modes: raise ValueError( f"Got {mode} for `mode`, but should be one of `{_valid_modes}`" @@ -80,6 +80,31 @@ 
class UnstructuredBaseLoader(BaseLoader, ABC): if hasattr(element, "category"): metadata["category"] = element.category docs.append(Document(page_content=str(element), metadata=metadata)) + elif self.mode == "paged": + text_dict: Dict[int, str] = {} + meta_dict: Dict[int, Dict] = {} + + for element in elements: + metadata = self._get_metadata() + if hasattr(element, "metadata"): + metadata.update(element.metadata.to_dict()) + page_number = metadata.get("page_number", 1) + + # Check if this page_number already exists in text_dict + if page_number not in text_dict: + # If not, create new entry with initial text and metadata + text_dict[page_number] = str(element) + "\n\n" + meta_dict[page_number] = metadata + else: + # If exists, append to text; later elements' metadata overrides shared keys + text_dict[page_number] += str(element) + "\n\n" + meta_dict[page_number].update(metadata) + + # Emit one Document per page, in the order pages were first seen + docs = [ + Document(page_content=text_dict[key], metadata=meta_dict[key]) + for key in text_dict + ] elif self.mode == "single": metadata = self._get_metadata() text = "\n\n".join([str(el) for el in elements]) diff --git a/tests/integration_tests/document_loaders/test_pdf.py b/tests/integration_tests/document_loaders/test_pdf.py index a5bc8cf1dd..324a2e0212 100644 --- a/tests/integration_tests/document_loaders/test_pdf.py +++ b/tests/integration_tests/document_loaders/test_pdf.py @@ -11,7 +11,25 @@ from langchain.document_loaders import ( ) -def test_unstructured_pdf_loader() -> None: +def test_unstructured_pdf_loader_elements_mode() -> None: + """Test unstructured loader in elements mode.""" + file_path = Path(__file__).parent.parent / "examples/hello.pdf" + loader = UnstructuredPDFLoader(str(file_path), mode="elements") + docs = loader.load() + + assert len(docs) == 2 + + +def test_unstructured_pdf_loader_paged_mode() -> None: + """Test unstructured loader in paged mode.""" + file_path = Path(__file__).parent.parent / 
"examples/layout-parser-paper.pdf" + loader = UnstructuredPDFLoader(str(file_path), mode="paged") + docs = loader.load() + + assert len(docs) == 16 + + +def test_unstructured_pdf_loader_default_mode() -> None: """Test unstructured loader.""" file_path = Path(__file__).parent.parent / "examples/hello.pdf" loader = UnstructuredPDFLoader(str(file_path))