Harrison/pypdf loader (#3764)

Co-authored-by: Felipe Meres <felipe@felipemeres.com>
fix_agent_callbacks
Harrison Chase 1 year ago committed by GitHub
parent 4eefea0fe8
commit 7a129ac043
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -566,10 +566,50 @@
"Additionally, you can pass along any of the options from the [PyMuPDF documentation](https://pymupdf.readthedocs.io/en/latest/app1.html#plain-text/) as keyword arguments in the `load` call, and they will be passed along to the `get_text()` call."
]
},
{
"cell_type": "markdown",
"id": "f0048206",
"metadata": {},
"source": [
"## PyPDF Directory\n",
"\n",
"Load PDFs from directory"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "ecd0cb16",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import PyPDFDirectoryLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "96592167",
"metadata": {},
"outputs": [],
"source": [
"loader = PyPDFDirectoryLoader(\"example_data/\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c750454c",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1bf73c97",
"id": "ab7f8fdb",
"metadata": {},
"outputs": [],
"source": []
@ -577,9 +617,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "langchain_dev",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "langchain_dev"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@ -591,7 +631,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -55,6 +55,7 @@ from langchain.document_loaders.pdf import (
PDFMinerLoader,
PDFMinerPDFasHTMLLoader,
PyMuPDFLoader,
PyPDFDirectoryLoader,
PyPDFLoader,
UnstructuredPDFLoader,
)
@ -166,4 +167,5 @@ __all__ = [
"WebBaseLoader",
"WhatsAppChatLoader",
"YoutubeLoader",
"PyPDFDirectoryLoader",
]

@ -1,8 +1,10 @@
"""Loader that loads PDF files."""
import logging
import os
import tempfile
from abc import ABC
from io import StringIO
from pathlib import Path
from typing import Any, List, Optional
from urllib.parse import urlparse
@ -12,6 +14,8 @@ from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
# Module-level logger. It is keyed on the module's dotted import name rather
# than its filesystem path so that it takes part in the package's logger
# hierarchy and can be configured through parent loggers.
logger = logging.getLogger(__name__)
class UnstructuredPDFLoader(UnstructuredFileLoader):
"""Loader that uses unstructured to load PDF files."""
@ -106,6 +110,51 @@ class PyPDFLoader(BasePDFLoader):
]
class PyPDFDirectoryLoader(BaseLoader):
    """Load every PDF file found under a directory using ``PyPDFLoader``.

    Files are discovered by applying a glob pattern to ``path``. Each match is
    loaded with ``PyPDFLoader`` and the documents it returns are gathered into
    a single list. The ``source`` metadata entry of every document is set to
    the path of the PDF file it was loaded from.
    """

    def __init__(
        self,
        path: str,
        glob: str = "**/[!.]*.pdf",
        silent_errors: bool = False,
        load_hidden: bool = False,
        recursive: bool = False,
    ) -> None:
        """Store the directory-scan configuration.

        Args:
            path: Directory to search for PDF files.
            glob: Glob pattern, relative to ``path``, used to find files.
            silent_errors: If True, a file that fails to load is logged as a
                warning and skipped; if False, the exception propagates.
            load_hidden: If True, also load files for which any component of
                the path relative to ``path`` starts with a dot.
            recursive: If True, match with ``Path.rglob`` instead of
                ``Path.glob``.
        """
        self.path = path
        self.glob = glob
        self.load_hidden = load_hidden
        self.recursive = recursive
        self.silent_errors = silent_errors

    @staticmethod
    def _is_visible(path: Path) -> bool:
        """Return True if no component of ``path`` starts with a dot."""
        return not any(part.startswith(".") for part in path.parts)

    def load(self) -> List[Document]:
        """Load all matching PDF files and return their documents.

        Returns:
            The documents produced by ``PyPDFLoader`` for every matched file,
            in the order the files are yielded by the glob.

        Raises:
            Exception: Whatever loading a file raises, unless
                ``silent_errors`` is set.
        """
        root = Path(self.path)
        matches = root.rglob(self.glob) if self.recursive else root.glob(self.glob)
        docs: List[Document] = []
        for pdf_path in matches:
            if not pdf_path.is_file():
                continue
            # Visibility is judged on the path relative to the root so that a
            # dot-prefixed component in the root directory itself does not
            # hide every file beneath it.
            if not (self.load_hidden or self._is_visible(pdf_path.relative_to(root))):
                continue
            try:
                sub_docs = PyPDFLoader(str(pdf_path)).load()
                for doc in sub_docs:
                    doc.metadata["source"] = str(pdf_path)
            except Exception as e:
                if not self.silent_errors:
                    # Bare raise keeps the original traceback intact.
                    raise
                # Include the file path so the warning identifies which PDF
                # was skipped.
                logger.warning("Error loading %s: %s", pdf_path, e)
            else:
                docs.extend(sub_docs)
        return docs
class PDFMinerLoader(BasePDFLoader):
"""Loader that uses PDFMiner to load PDF files."""

Loading…
Cancel
Save