@ -1,8 +1,10 @@
""" Loader that loads PDF files. """
""" Loader that loads PDF files. """
import logging
import os
import os
import tempfile
import tempfile
from abc import ABC
from abc import ABC
from io import StringIO
from io import StringIO
from pathlib import Path
from typing import Any , List , Optional
from typing import Any , List , Optional
from urllib . parse import urlparse
from urllib . parse import urlparse
@ -12,6 +14,8 @@ from langchain.docstore.document import Document
from langchain . document_loaders . base import BaseLoader
from langchain . document_loaders . base import BaseLoader
from langchain . document_loaders . unstructured import UnstructuredFileLoader
from langchain . document_loaders . unstructured import UnstructuredFileLoader
logger = logging . getLogger ( __file__ )
class UnstructuredPDFLoader ( UnstructuredFileLoader ) :
class UnstructuredPDFLoader ( UnstructuredFileLoader ) :
""" Loader that uses unstructured to load PDF files. """
""" Loader that uses unstructured to load PDF files. """
@ -106,6 +110,51 @@ class PyPDFLoader(BasePDFLoader):
]
]
class PyPDFDirectoryLoader ( BaseLoader ) :
""" Loads a directory with PDF files with pypdf and chunks at character level.
Loader also stores page numbers in metadatas .
"""
def __init__ (
self ,
path : str ,
glob : str = " **/[!.]*.pdf " ,
silent_errors : bool = False ,
load_hidden : bool = False ,
recursive : bool = False ,
) :
self . path = path
self . glob = glob
self . load_hidden = load_hidden
self . recursive = recursive
self . silent_errors = silent_errors
@staticmethod
def _is_visible ( path : Path ) - > bool :
return not any ( part . startswith ( " . " ) for part in path . parts )
def load ( self ) - > List [ Document ] :
p = Path ( self . path )
docs = [ ]
items = p . rglob ( self . glob ) if self . recursive else p . glob ( self . glob )
for i in items :
if i . is_file ( ) :
if self . _is_visible ( i . relative_to ( p ) ) or self . load_hidden :
try :
loader = PyPDFLoader ( str ( i ) )
sub_docs = loader . load ( )
for doc in sub_docs :
doc . metadata [ " source " ] = str ( i )
docs . extend ( sub_docs )
except Exception as e :
if self . silent_errors :
logger . warning ( e )
else :
raise e
return docs
class PDFMinerLoader ( BasePDFLoader ) :
class PDFMinerLoader ( BasePDFLoader ) :
""" Loader that uses PDFMiner to load PDF files. """
""" Loader that uses PDFMiner to load PDF files. """