@ -1,8 +1,10 @@
""" Loader that loads PDF files. """
import logging
import os
import tempfile
from abc import ABC
from io import StringIO
from pathlib import Path
from typing import Any , List , Optional
from urllib . parse import urlparse
@ -12,6 +14,8 @@ from langchain.docstore.document import Document
from langchain . document_loaders . base import BaseLoader
from langchain . document_loaders . unstructured import UnstructuredFileLoader
logger = logging . getLogger ( __file__ )
class UnstructuredPDFLoader ( UnstructuredFileLoader ) :
""" Loader that uses unstructured to load PDF files. """
@ -106,6 +110,51 @@ class PyPDFLoader(BasePDFLoader):
]
class PyPDFDirectoryLoader ( BaseLoader ) :
""" Loads a directory with PDF files with pypdf and chunks at character level.
Loader also stores page numbers in metadatas .
"""
def __init__ (
self ,
path : str ,
glob : str = " **/[!.]*.pdf " ,
silent_errors : bool = False ,
load_hidden : bool = False ,
recursive : bool = False ,
) :
self . path = path
self . glob = glob
self . load_hidden = load_hidden
self . recursive = recursive
self . silent_errors = silent_errors
@staticmethod
def _is_visible ( path : Path ) - > bool :
return not any ( part . startswith ( " . " ) for part in path . parts )
def load ( self ) - > List [ Document ] :
p = Path ( self . path )
docs = [ ]
items = p . rglob ( self . glob ) if self . recursive else p . glob ( self . glob )
for i in items :
if i . is_file ( ) :
if self . _is_visible ( i . relative_to ( p ) ) or self . load_hidden :
try :
loader = PyPDFLoader ( str ( i ) )
sub_docs = loader . load ( )
for doc in sub_docs :
doc . metadata [ " source " ] = str ( i )
docs . extend ( sub_docs )
except Exception as e :
if self . silent_errors :
logger . warning ( e )
else :
raise e
return docs
class PDFMinerLoader ( BasePDFLoader ) :
""" Loader that uses PDFMiner to load PDF files. """