diff --git a/docs/modules/indexes/document_loaders/examples/pdf.ipynb b/docs/modules/indexes/document_loaders/examples/pdf.ipynb index abccc80c97..762b9c7d1c 100644 --- a/docs/modules/indexes/document_loaders/examples/pdf.ipynb +++ b/docs/modules/indexes/document_loaders/examples/pdf.ipynb @@ -97,7 +97,7 @@ }, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ "OpenAI API Key: ········\n" @@ -673,6 +673,68 @@ "docs = loader.load()" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "45bb0415", + "metadata": {}, + "source": [ + "## Using pdfplumber\n", + "\n", + "Like PyMuPDF, the loader returns one Document per page, and each Document contains detailed metadata about the PDF and its pages." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "aefa758d", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import PDFPlumberLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "049e9d9a", + "metadata": {}, + "outputs": [], + "source": [ + "loader = PDFPlumberLoader(\"example_data/layout-parser-paper.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a8610efa", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8132e551", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='LayoutParser: A Unified Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1 ((cid:0)), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1 Allen Institute for AI\\n1202 shannons@allenai.org\\n2 Brown University\\nruochen zhang@brown.edu\\n3 Harvard University\\nnuJ {melissadell,jacob carlson}@fas.harvard.edu\\n4 University of Washington\\nbcgl@cs.washington.edu\\n12 5 University of Waterloo\\nw422li@uwaterloo.ca\\n]VC.sc[\\nAbstract. 
Recentadvancesindocumentimageanalysis(DIA)havebeen\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomescouldbeeasilydeployedinproductionandextendedforfurther\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model configurations complicate the easy reuse of im-\\n2v84351.3012:viXra portantinnovationsbyawideaudience.Thoughtherehavebeenon-going\\nefforts to improve reusability and simplify deep learning (DL) model\\ndevelopmentindisciplineslikenaturallanguageprocessingandcomputer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademicresearchacross awiderangeof disciplinesinthesocialsciences\\nand humanities. This paper introduces LayoutParser, an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. The core LayoutParser library comes with a set of simple and\\nintuitiveinterfacesforapplyingandcustomizingDLmodelsforlayoutde-\\ntection,characterrecognition,andmanyotherdocumentprocessingtasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. 
We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io.\\nKeywords: DocumentImageAnalysis·DeepLearning·LayoutAnalysis\\n· Character Recognition · Open Source library · Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocumentimageanalysis(DIA)tasksincludingdocumentimageclassification[11,', metadata={'source': 'example_data/layout-parser-paper.pdf', 'file_path': 'example_data/layout-parser-paper.pdf', 'page': 1, 'total_pages': 16, 'Author': '', 'CreationDate': 'D:20210622012710Z', 'Creator': 'LaTeX with hyperref', 'Keywords': '', 'ModDate': 'D:20210622012710Z', 'PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'Producer': 'pdfTeX-1.40.21', 'Subject': '', 'Title': '', 'Trapped': 'False'})" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + }, { "cell_type": "code", "execution_count": null, @@ -698,7 +760,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 1b8aa3cb30..be3500cbf4 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -60,6 +60,7 @@ from langchain.document_loaders.pdf import ( OnlinePDFLoader, PDFMinerLoader, PDFMinerPDFasHTMLLoader, + PDFPlumberLoader, PyMuPDFLoader, PyPDFDirectoryLoader, PyPDFium2Loader, @@ -166,6 +167,7 @@ __all__ = [ "OutlookMessageLoader", "PDFMinerLoader", "PDFMinerPDFasHTMLLoader", + "PDFPlumberLoader", "PagedPDFSplitter", "PlaywrightURLLoader", "PyMuPDFLoader", diff --git a/langchain/document_loaders/parsers/__init__.py 
b/langchain/document_loaders/parsers/__init__.py
index b79b49422a..d1e72bbb08 100644
--- a/langchain/document_loaders/parsers/__init__.py
+++ b/langchain/document_loaders/parsers/__init__.py
@@ -1,8 +1,15 @@
 from langchain.document_loaders.parsers.pdf import (
     PDFMinerParser,
+    PDFPlumberParser,
     PyMuPDFParser,
     PyPDFium2Parser,
     PyPDFParser,
 )
 
-__all__ = ["PyPDFParser", "PDFMinerParser", "PyMuPDFParser", "PyPDFium2Parser"]
+__all__ = [
+    "PyPDFParser",
+    "PDFMinerParser",
+    "PyMuPDFParser",
+    "PyPDFium2Parser",
+    "PDFPlumberParser",
+]
diff --git a/langchain/document_loaders/parsers/pdf.py b/langchain/document_loaders/parsers/pdf.py
index dcc729bdbe..f1f75280b2 100644
--- a/langchain/document_loaders/parsers/pdf.py
+++ b/langchain/document_loaders/parsers/pdf.py
@@ -99,3 +99,51 @@ class PyPDFium2Parser(BaseBlobParser):
                 content = page.get_textpage().get_text_range()
                 metadata = {"source": blob.source, "page": page_number}
                 yield Document(page_content=content, metadata=metadata)
+
+
+class PDFPlumberParser(BaseBlobParser):
+    """Parse PDFs with PDFPlumber."""
+
+    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
+        """Initialize the parser.
+
+        Args:
+            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
+        """
+        self.text_kwargs = text_kwargs or {}
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse the blob, yielding one Document per page.
+
+        Args:
+            blob: The PDF content to parse.
+
+        Yields:
+            A Document per page whose metadata holds the blob source, the page
+            number, the total page count and the ``str``/``int`` entries of the
+            PDF's own metadata.
+        """
+        import pdfplumber
+
+        # pdfplumber's PDF object is a context manager: it is closed on exit.
+        with blob.as_bytes_io() as stream, pdfplumber.open(stream) as doc:
+            # Document-level values are the same for every page: compute once.
+            pdf_metadata = {
+                key: value
+                for key, value in doc.metadata.items()
+                if isinstance(value, (str, int))
+            }
+            total_pages = len(doc.pages)
+
+            # Yield page by page rather than building a list of all pages first.
+            for page in doc.pages:
+                yield Document(
+                    page_content=page.extract_text(**self.text_kwargs),
+                    metadata={
+                        "source": blob.source,
+                        "file_path": blob.source,
+                        "page": page.page_number,
+                        "total_pages": total_pages,
+                        **pdf_metadata,
+                    },
+                )
diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py
index fe84e0c0db..9a61f36c2f 100644
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@@ -7,7 +7,7 @@ import time
 from abc import ABC
 from io import StringIO
 from pathlib import Path
-from typing import Any, Iterator, List, Optional
+from typing import Any, Iterator, List, Mapping, Optional
 from urllib.parse import urlparse
 
 import requests
@@ -17,6 +17,7 @@ from langchain.document_loaders.base import BaseLoader
 from langchain.document_loaders.blob_loaders import Blob
 from langchain.document_loaders.parsers.pdf import (
     PDFMinerParser,
+    PDFPlumberParser,
     PyMuPDFParser,
     PyPDFium2Parser,
     PyPDFParser,
@@ -362,3 +363,36 @@ class MathpixPDFLoader(BasePDFLoader):
         contents = self.clean_pdf(contents)
         metadata = {"source": self.source, "file_path": self.source}
         return [Document(page_content=contents, metadata=metadata)]
+
+
+class PDFPlumberLoader(BasePDFLoader):
+    """Loader that uses pdfplumber to load PDF files."""
+
+    def __init__(
+        self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
+    ) -> None:
+        """Initialize with file path.
+
+        Args:
+            file_path: Location of the PDF, forwarded to ``BasePDFLoader``.
+            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
+
+        Raises:
+            ValueError: If the ``pdfplumber`` package cannot be imported.
+        """
+        try:
+            import pdfplumber  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "pdfplumber package not found, please install it with "
+                "`pip install pdfplumber`"
+            )
+
+        super().__init__(file_path)
+        self.text_kwargs = text_kwargs or {}
+
+    def load(self) -> List[Document]:
+        """Parse the file with ``PDFPlumberParser`` and return its Documents."""
+        parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
+        blob = Blob.from_path(self.file_path)
+        return parser.parse(blob)
diff --git a/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
index f847fb82a8..7b76e0f721 100644
--- a/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
+++ b/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -6,6 +6,7 @@ from langchain.document_loaders.base import BaseBlobParser
 from langchain.document_loaders.blob_loaders import Blob
 from langchain.document_loaders.parsers.pdf import (
     PDFMinerParser,
+    PDFPlumberParser,
     PyMuPDFParser,
     PyPDFium2Parser,
     PyPDFParser,
@@ -78,3 +79,8 @@ def test_pypdfium2_parser() -> None:
     """Test PyPDFium2 parser."""
     # Does not follow defaults to split by page.
     _assert_with_parser(PyPDFium2Parser())
+
+
+def test_pdfplumber_parser() -> None:
+    """Test PDFPlumber parser."""
+    _assert_with_parser(PDFPlumberParser())
diff --git a/tests/integration_tests/document_loaders/parsers/test_public_api.py b/tests/integration_tests/document_loaders/parsers/test_public_api.py
index 52ce7e8e3e..00da8749ac 100644
--- a/tests/integration_tests/document_loaders/parsers/test_public_api.py
+++ b/tests/integration_tests/document_loaders/parsers/test_public_api.py
@@ -8,4 +8,5 @@ def test_parsers_public_api_correct() -> None:
         "PDFMinerParser",
         "PyMuPDFParser",
         "PyPDFium2Parser",
+        "PDFPlumberParser",
     }
diff --git a/tests/unit_tests/document_loader/parsers/test_public_api.py b/tests/unit_tests/document_loader/parsers/test_public_api.py
index 52ce7e8e3e..00da8749ac 100644
--- a/tests/unit_tests/document_loader/parsers/test_public_api.py
+++ b/tests/unit_tests/document_loader/parsers/test_public_api.py
@@ -8,4 +8,5 @@ def test_parsers_public_api_correct() -> None:
         "PDFMinerParser",
         "PyMuPDFParser",
         "PyPDFium2Parser",
+        "PDFPlumberParser",
     }