feat: Add page metadata on PDFMinerLoader (#12277)

- **Description:** #12273 's suggestion PR
Like other PDFLoader, loading pdf per each page and giving page
metadata.
  - **Issue:** #12273 
  - **Twitter handle:** @blue0_0hope

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
pull/12731/head
Dave Kwon 11 months ago committed by GitHub
parent 7148f3e1fe
commit b1954aab13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -128,18 +128,36 @@ class PyPDFParser(BaseBlobParser):
class PDFMinerParser(BaseBlobParser):
"""Parse `PDF` using `PDFMiner`."""
def __init__(self, extract_images: bool = False):
def __init__(self, extract_images: bool = False, *, concatenate_pages: bool = True):
"""Initialize a parser based on PDFMiner.
Args:
extract_images: Whether to extract images from PDF.
concatenate_pages: If True, concatenate all PDF pages into one a single
document. Otherwise, return one document per page.
"""
self.extract_images = extract_images
self.concatenate_pages = concatenate_pages
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
if not self.extract_images:
from pdfminer.high_level import extract_text
with blob.as_bytes_io() as pdf_file_obj:
text = extract_text(pdf_file_obj)
metadata = {"source": blob.source}
yield Document(page_content=text, metadata=metadata)
if self.concatenate_pages:
text = extract_text(pdf_file_obj)
metadata = {"source": blob.source}
yield Document(page_content=text, metadata=metadata)
else:
from pdfminer.pdfpage import PDFPage
pages = PDFPage.get_pages(pdf_file_obj)
for i, _ in enumerate(pages):
text = extract_text(pdf_file_obj, page_numbers=[i])
metadata = {"source": blob.source, "page": str(i)}
yield Document(page_content=text, metadata=metadata)
else:
import io

@ -251,8 +251,15 @@ class PDFMinerLoader(BasePDFLoader):
*,
headers: Optional[Dict] = None,
extract_images: bool = False,
concatenate_pages: bool = True,
) -> None:
"""Initialize with file path."""
"""Initialize with file path.
Args:
extract_images: Whether to extract images from PDF.
concatenate_pages: If True, concatenate all PDF pages into one a single
document. Otherwise, return one document per page.
"""
try:
from pdfminer.high_level import extract_text # noqa:F401
except ImportError:
@ -262,7 +269,9 @@ class PDFMinerLoader(BasePDFLoader):
)
super().__init__(file_path, headers=headers)
self.parser = PDFMinerParser(extract_images=extract_images)
self.parser = PDFMinerParser(
extract_images=extract_images, concatenate_pages=concatenate_pages
)
def load(self) -> List[Document]:
"""Eagerly load the content."""

@ -56,6 +56,19 @@ def test_pdfminer_loader() -> None:
docs = loader.load()
assert len(docs) == 1
# Verify that concatenating pages parameter works
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
loader = PDFMinerLoader(str(file_path), concatenate_pages=True)
docs = loader.load()
assert len(docs) == 1
file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
loader = PDFMinerLoader(str(file_path), concatenate_pages=False)
docs = loader.load()
assert len(docs) == 16
def test_pdfminer_pdf_as_html_loader() -> None:
"""Test PDFMinerPDFasHTMLLoader."""

Loading…
Cancel
Save