diff --git a/docs/modules/indexes/document_loaders/examples/pdf.ipynb b/docs/modules/indexes/document_loaders/examples/pdf.ipynb index 6fcb8175..8d2e4270 100644 --- a/docs/modules/indexes/document_loaders/examples/pdf.ipynb +++ b/docs/modules/indexes/document_loaders/examples/pdf.ipynb @@ -566,10 +566,50 @@ "Additionally, you can pass along any of the options from the [PyMuPDF documentation](https://pymupdf.readthedocs.io/en/latest/app1.html#plain-text/) as keyword arguments in the `load` call, and it will be pass along to the `get_text()` call." ] }, + { + "cell_type": "markdown", + "id": "f0048206", + "metadata": {}, + "source": [ + "## PyPDF Directory\n", + "\n", + "Load PDFs from directory" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ecd0cb16", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import PyPDFDirectoryLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "96592167", + "metadata": {}, + "outputs": [], + "source": [ + "loader = PyPDFDirectoryLoader(\"example_data/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c750454c", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "1bf73c97", + "id": "ab7f8fdb", "metadata": {}, "outputs": [], "source": [] @@ -577,9 +617,9 @@ ], "metadata": { "kernelspec": { - "display_name": "langchain_dev", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "langchain_dev" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -591,7 +631,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 3d978679..47c3fc02 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ 
class PyPDFDirectoryLoader(BaseLoader):
    """Load every PDF file found under a directory with ``PyPDFLoader``.

    Each matching file is passed to ``PyPDFLoader`` and the documents it
    returns are collected into one list. The ``source`` metadata entry of
    every returned document is set to the path of the file it came from.
    Any other metadata is whatever ``PyPDFLoader`` produces (presumably
    per-page information such as page numbers -- confirm against
    ``PyPDFLoader``).

    Args:
        path: Directory to search for PDF files.
        glob: Glob pattern, relative to ``path``, selecting the files to
            load. The default starts with ``**/`` and therefore already
            descends into subdirectories, even when ``recursive`` is False.
        silent_errors: If True, a file that fails to load is logged as a
            warning and skipped; if False, the exception is re-raised.
        load_hidden: If True, also load files whose path (relative to
            ``path``) contains a component starting with ``"."``.
        recursive: If True, match ``glob`` with ``Path.rglob`` instead of
            ``Path.glob``.
    """

    def __init__(
        self,
        path: str,
        glob: str = "**/[!.]*.pdf",
        silent_errors: bool = False,
        load_hidden: bool = False,
        recursive: bool = False,
    ) -> None:
        self.path = path
        self.glob = glob
        self.load_hidden = load_hidden
        self.recursive = recursive
        self.silent_errors = silent_errors

    @staticmethod
    def _is_visible(path: Path) -> bool:
        """Return True if no component of ``path`` starts with a dot."""
        return not any(part.startswith(".") for part in path.parts)

    def load(self) -> List[Document]:
        """Load all matching PDF files and return their documents.

        Returns:
            The documents of every loaded file, concatenated in the order
            the files are yielded by the glob.

        Raises:
            Exception: Whatever ``PyPDFLoader`` raises for a file, unless
                ``silent_errors`` is True.
        """
        root = Path(self.path)
        candidates = root.rglob(self.glob) if self.recursive else root.glob(self.glob)

        docs: List[Document] = []
        for pdf_path in candidates:
            if not pdf_path.is_file():
                continue
            # Visibility is judged on the path relative to the root, so a
            # regular file inside a hidden directory is skipped as well, while
            # a dot-directory above the root does not matter.
            if not (self.load_hidden or self._is_visible(pdf_path.relative_to(root))):
                continue

            try:
                sub_docs = PyPDFLoader(str(pdf_path)).load()
            except Exception as e:
                if not self.silent_errors:
                    # Bare ``raise`` re-raises with the original traceback.
                    raise
                # Name the offending file so a skipped PDF can be identified.
                logger.warning("Failed to load %s: %s", pdf_path, e)
                continue

            for doc in sub_docs:
                doc.metadata["source"] = str(pdf_path)
            docs.extend(sub_docs)
        return docs