diff --git a/docs/modules/indexes/document_loaders/examples/pdf.ipynb b/docs/modules/indexes/document_loaders/examples/pdf.ipynb
index b8a222e9..e1ec7035 100644
--- a/docs/modules/indexes/document_loaders/examples/pdf.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/pdf.ipynb
@@ -372,6 +372,44 @@
   {
    "cell_type": "code",
    "execution_count": 9,
+   "id": "483720b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "96351714",
+   "metadata": {},
+   "source": [
+    "# Using PyPDFium2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "003fcc1d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import PyPDFium2Loader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "46766e29",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = PyPDFium2Loader(\"example_data/layout-parser-paper.pdf\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
    "id": "010d5cdd",
    "metadata": {},
    "outputs": [],
@@ -662,7 +700,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.11.3"
   }
  },
  "nbformat": 4,
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index b205ba29..e5c5eb1c 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -61,6 +61,7 @@ from langchain.document_loaders.pdf import (
     PDFMinerPDFasHTMLLoader,
     PyMuPDFLoader,
     PyPDFDirectoryLoader,
+    PyPDFium2Loader,
     PyPDFLoader,
     UnstructuredPDFLoader,
 )
@@ -161,6 +162,7 @@ __all__ = [
     "PlaywrightURLLoader",
     "PyMuPDFLoader",
     "PyPDFLoader",
+    "PyPDFium2Loader",
     "PythonLoader",
     "ReadTheDocsLoader",
     "RoamLoader",
diff --git a/langchain/document_loaders/pdf.py b/langchain/document_loaders/pdf.py
index 95a41f15..d950f3ff 100644
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@@ -115,6 +115,52 @@ class PyPDFLoader(BasePDFLoader):
             ]
 
 
+class PyPDFium2Loader(BasePDFLoader):
+    """Load a PDF with pypdfium2, producing one Document per page."""
+
+    def __init__(self, file_path: str):
+        """Initialize with file path.
+
+        Raises:
+            ValueError: If the ``pypdfium2`` package is not installed.
+        """
+        try:
+            import pypdfium2  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "pypdfium2 package not found, please install it with"
+                " `pip install pypdfium2`"
+            )
+        super().__init__(file_path)
+
+    def load(self) -> List[Document]:
+        """Load given path as pages.
+
+        Returns:
+            One Document per page, in page order. ``metadata["page"]`` is the
+            zero-based page index and ``metadata["source"]`` is the file path.
+        """
+        import pypdfium2
+
+        docs = []
+        with open(self.file_path, "rb") as f:
+            pdf_reader = pypdfium2.PdfDocument(f)
+            try:
+                for i, page in enumerate(pdf_reader):
+                    text_page = page.get_textpage()
+                    content = text_page.get_text_range()
+                    # pypdfium2 objects wrap native PDFium handles; release
+                    # them per page instead of waiting for garbage collection.
+                    text_page.close()
+                    page.close()
+                    metadata = {"source": self.file_path, "page": i}
+                    docs.append(Document(page_content=content, metadata=metadata))
+            finally:
+                # Close the document before the underlying file is closed.
+                pdf_reader.close()
+        return docs
+
+
 class PyPDFDirectoryLoader(BaseLoader):
     """Loads a directory with PDF files with pypdf and chunks at character level.
 
diff --git a/tests/integration_tests/document_loaders/test_pdf.py b/tests/integration_tests/document_loaders/test_pdf.py
index 8aa7fd7b..a5bc8cf1 100644
--- a/tests/integration_tests/document_loaders/test_pdf.py
+++ b/tests/integration_tests/document_loaders/test_pdf.py
@@ -1,12 +1,14 @@
 from pathlib import Path
 
 from langchain.document_loaders import (
+    MathpixPDFLoader,
     PDFMinerLoader,
     PDFMinerPDFasHTMLLoader,
     PyMuPDFLoader,
+    PyPDFium2Loader,
+    PyPDFLoader,
     UnstructuredPDFLoader,
 )
-from langchain.document_loaders.pdf import MathpixPDFLoader
 
 
 def test_unstructured_pdf_loader() -> None:
@@ -48,6 +50,36 @@ def test_pdfminer_pdf_as_html_loader() -> None:
     assert len(docs) == 1
 
 
+def test_pypdf_loader() -> None:
+    """Test PyPDFLoader."""
+    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
+    loader = PyPDFLoader(str(file_path))
+    docs = loader.load()
+
+    assert len(docs) == 1
+
+    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
+    loader = PyPDFLoader(str(file_path))
+
+    docs = loader.load()
+    assert len(docs) == 16
+
+
+def test_pypdfium2_loader() -> None:
+    """Test PyPDFium2Loader."""
+    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
+    loader = PyPDFium2Loader(str(file_path))
+    docs = loader.load()
+
+    assert len(docs) == 1
+
+    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
+    loader = PyPDFium2Loader(str(file_path))
+
+    docs = loader.load()
+    assert len(docs) == 16
+
+
 def test_pymupdf_loader() -> None:
     """Test PyMuPDF loader."""
     file_path = Path(__file__).parent.parent / "examples/hello.pdf"