Harrison/pypdf loader (#3764)

Co-authored-by: Felipe Meres <felipe@felipemeres.com>
This commit is contained in:
Harrison Chase 2023-04-28 19:56:21 -07:00 committed by GitHub
parent 4eefea0fe8
commit 7a129ac043
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 95 additions and 4 deletions

View File

@ -566,10 +566,50 @@
"Additionally, you can pass along any of the options from the [PyMuPDF documentation](https://pymupdf.readthedocs.io/en/latest/app1.html#plain-text/) as keyword arguments in the `load` call, and it will be pass along to the `get_text()` call." "Additionally, you can pass along any of the options from the [PyMuPDF documentation](https://pymupdf.readthedocs.io/en/latest/app1.html#plain-text/) as keyword arguments in the `load` call, and it will be pass along to the `get_text()` call."
] ]
}, },
{
"cell_type": "markdown",
"id": "f0048206",
"metadata": {},
"source": [
"## PyPDF Directory\n",
"\n",
"Load PDFs from directory"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ecd0cb16",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import PyPDFDirectoryLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "96592167",
"metadata": {},
"outputs": [],
"source": [
"loader = PyPDFDirectoryLoader(\"example_data/\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c750454c",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "1bf73c97", "id": "ab7f8fdb",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []
@ -577,9 +617,9 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "langchain_dev", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
"name": "langchain_dev" "name": "python3"
}, },
"language_info": { "language_info": {
"codemirror_mode": { "codemirror_mode": {
@ -591,7 +631,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.8" "version": "3.9.1"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -55,6 +55,7 @@ from langchain.document_loaders.pdf import (
PDFMinerLoader, PDFMinerLoader,
PDFMinerPDFasHTMLLoader, PDFMinerPDFasHTMLLoader,
PyMuPDFLoader, PyMuPDFLoader,
PyPDFDirectoryLoader,
PyPDFLoader, PyPDFLoader,
UnstructuredPDFLoader, UnstructuredPDFLoader,
) )
@ -166,4 +167,5 @@ __all__ = [
"WebBaseLoader", "WebBaseLoader",
"WhatsAppChatLoader", "WhatsAppChatLoader",
"YoutubeLoader", "YoutubeLoader",
"PyPDFDirectoryLoader",
] ]

View File

@ -1,8 +1,10 @@
"""Loader that loads PDF files.""" """Loader that loads PDF files."""
import logging
import os import os
import tempfile import tempfile
from abc import ABC from abc import ABC
from io import StringIO from io import StringIO
from pathlib import Path
from typing import Any, List, Optional from typing import Any, List, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
@ -12,6 +14,8 @@ from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader
# Name the logger after the module's dotted import path rather than its
# filesystem location, so it takes part in the package's logger hierarchy and
# has the same name regardless of where the package is installed.
logger = logging.getLogger(__name__)
class UnstructuredPDFLoader(UnstructuredFileLoader): class UnstructuredPDFLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load PDF files.""" """Loader that uses unstructured to load PDF files."""
@ -106,6 +110,51 @@ class PyPDFLoader(BasePDFLoader):
] ]
class PyPDFDirectoryLoader(BaseLoader):
    """Load every PDF file found under a directory using ``PyPDFLoader``.

    Each matching file is loaded with ``PyPDFLoader`` and the resulting
    documents are concatenated into a single list. The ``source`` metadata
    entry of every document is set to the path of the file it came from.
    """

    def __init__(
        self,
        path: str,
        glob: str = "**/[!.]*.pdf",
        silent_errors: bool = False,
        load_hidden: bool = False,
        recursive: bool = False,
    ):
        """Initialize with the directory to scan.

        Args:
            path: Directory to search for PDF files.
            glob: Pattern, relative to ``path``, selecting the files to load.
            silent_errors: If True, a file that fails to load is logged as a
                warning and skipped instead of aborting the whole load.
            load_hidden: If True, also load files whose path relative to
                ``path`` contains a component starting with ".".
            recursive: If True, match with ``Path.rglob`` instead of
                ``Path.glob``.
        """
        self.path = path
        self.glob = glob
        self.load_hidden = load_hidden
        self.recursive = recursive
        self.silent_errors = silent_errors

    @staticmethod
    def _is_visible(path: Path) -> bool:
        """Return True if no component of ``path`` starts with a dot."""
        return not any(part.startswith(".") for part in path.parts)

    def load(self) -> List[Document]:
        """Load all matching PDF files and return their documents.

        Returns:
            The documents produced by ``PyPDFLoader`` for every matching file,
            in the order the files are yielded by the glob.

        Raises:
            Exception: Whatever loading a file raises, unless
                ``silent_errors`` is True.
        """
        base = Path(self.path)
        docs: List[Document] = []
        matches = base.rglob(self.glob) if self.recursive else base.glob(self.glob)
        for file_path in matches:
            if not file_path.is_file():
                continue
            # Visibility is judged on the path relative to the base directory,
            # so a dot-prefixed component in ``self.path`` itself does not hide
            # everything beneath it.
            if not (self.load_hidden or self._is_visible(file_path.relative_to(base))):
                continue
            try:
                sub_docs = PyPDFLoader(str(file_path)).load()
                for doc in sub_docs:
                    doc.metadata["source"] = str(file_path)
                docs.extend(sub_docs)
            except Exception as e:
                if not self.silent_errors:
                    raise
                # Include the file path: the exception text alone does not say
                # which file in the directory was skipped.
                logger.warning("Error loading %s: %s", file_path, e)
        return docs
class PDFMinerLoader(BasePDFLoader): class PDFMinerLoader(BasePDFLoader):
"""Loader that uses PDFMiner to load PDF files.""" """Loader that uses PDFMiner to load PDF files."""