Harrison/unstructured page number (#6464)

Co-authored-by: Reza Sanaie <reza@sanaie.ca>
master
Harrison Chase 11 months ago committed by GitHub
parent b82ddf9cfb
commit 9eec7c3206
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -226,13 +226,17 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "8de9ef16",
"metadata": {},
"source": [
"## PDF Example\n",
"\n",
"Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of `elements`. "
"Processing PDF documents works exactly the same way. Unstructured detects the file type and extracts the same types of elements. Modes of operation are \n",
"- `single`: the text from all elements is combined into one document (default)\n",
"- `elements`: each element is kept as an individual document\n",
"- `paged`: the text of the elements on each page is combined into one document per page"
]
},
{

@ -1,7 +1,7 @@
"""Loader that uses unstructured to load files."""
import collections
from abc import ABC, abstractmethod
from typing import IO, Any, List, Sequence, Union
from typing import IO, Any, Dict, List, Sequence, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
@ -45,7 +45,7 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
"unstructured package not found, please install it with "
"`pip install unstructured`"
)
_valid_modes = {"single", "elements"}
_valid_modes = {"single", "elements", "paged"}
if mode not in _valid_modes:
raise ValueError(
f"Got {mode} for `mode`, but should be one of `{_valid_modes}`"
@ -80,6 +80,31 @@ class UnstructuredBaseLoader(BaseLoader, ABC):
if hasattr(element, "category"):
metadata["category"] = element.category
docs.append(Document(page_content=str(element), metadata=metadata))
elif self.mode == "paged":
text_dict: Dict[int, str] = {}
meta_dict: Dict[int, Dict] = {}
for idx, element in enumerate(elements):
metadata = self._get_metadata()
if hasattr(element, "metadata"):
metadata.update(element.metadata.to_dict())
page_number = metadata.get("page_number", 1)
# Check if this page_number already exists in text_dict
if page_number not in text_dict:
# If not, create new entry with initial text and metadata
text_dict[page_number] = str(element) + "\n\n"
meta_dict[page_number] = metadata
else:
# If exists, append to text and update the metadata
text_dict[page_number] += str(element) + "\n\n"
meta_dict[page_number].update(metadata)
# Convert the dict to a list of Document objects
docs = [
Document(page_content=text_dict[key], metadata=meta_dict[key])
for key in text_dict.keys()
]
elif self.mode == "single":
metadata = self._get_metadata()
text = "\n\n".join([str(el) for el in elements])

@ -11,7 +11,25 @@ from langchain.document_loaders import (
)
def test_unstructured_pdf_loader() -> None:
def test_unstructured_pdf_loader_elements_mode() -> None:
    """Load hello.pdf in `elements` mode and expect exactly two documents."""
    # The fixture lives in the `examples` directory two levels above this file.
    pdf_path = Path(__file__).parent.parent / "examples/hello.pdf"
    documents = UnstructuredPDFLoader(str(pdf_path), mode="elements").load()
    assert len(documents) == 2
def test_unstructured_pdf_loader_paged_mode() -> None:
    """Load layout-parser-paper.pdf in `paged` mode and expect 16 documents."""
    # The fixture lives in the `examples` directory two levels above this file.
    pdf_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
    documents = UnstructuredPDFLoader(str(pdf_path), mode="paged").load()
    assert len(documents) == 16
def test_unstructured_pdf_loader_default_mode() -> None:
"""Test unstructured loader."""
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = UnstructuredPDFLoader(str(file_path))

Loading…
Cancel
Save