@ -7,7 +7,7 @@ import time
from abc import ABC
from io import StringIO
from pathlib import Path
from typing import Any , Iterator , List , Mapping , Optional , Union
from typing import Any , Iterator , List , Mapping , Optional , Sequence, Union
from urllib . parse import urlparse
import requests
@ -16,6 +16,7 @@ from langchain.docstore.document import Document
from langchain . document_loaders . base import BaseLoader
from langchain . document_loaders . blob_loaders import Blob
from langchain . document_loaders . parsers . pdf import (
AmazonTextractPDFParser ,
PDFMinerParser ,
PDFPlumberParser ,
PyMuPDFParser ,
@ -71,8 +72,14 @@ class BasePDFLoader(BaseLoader, ABC):
if " ~ " in self . file_path :
self . file_path = os . path . expanduser ( self . file_path )
# If the file is a web path , download it to a temporary file, and use that
# If the file is a web path or S3 , download it to a temporary file, and use that
if not os . path . isfile ( self . file_path ) and self . _is_valid_url ( self . file_path ) :
self . temp_dir = tempfile . TemporaryDirectory ( )
_ , suffix = os . path . splitext ( self . file_path )
temp_pdf = os . path . join ( self . temp_dir . name , f " tmp { suffix } " )
if self . _is_s3_url ( self . file_path ) :
self . web_path = self . file_path
else :
r = requests . get ( self . file_path )
if r . status_code != 200 :
@ -82,8 +89,6 @@ class BasePDFLoader(BaseLoader, ABC):
)
self . web_path = self . file_path
self . temp_dir = tempfile . TemporaryDirectory ( )
temp_pdf = Path ( self . temp_dir . name ) / " tmp.pdf "
with open ( temp_pdf , mode = " wb " ) as f :
f . write ( r . content )
self . file_path = str ( temp_pdf )
@ -100,6 +105,17 @@ class BasePDFLoader(BaseLoader, ABC):
parsed = urlparse ( url )
return bool ( parsed . netloc ) and bool ( parsed . scheme )
@staticmethod
def _is_s3_url(url: str) -> bool:
    """Tell whether ``url`` is an S3 location.

    A URL counts as S3 when its parsed scheme is ``s3`` and it carries a
    non-empty network location. A string that ``urlparse`` rejects with
    ``ValueError`` is reported as not being an S3 URL.
    """
    try:
        parts = urlparse(url)
    except ValueError:
        return False
    return parts.scheme == "s3" and bool(parts.netloc)
@property
def source(self) -> str:
    """Return the location to report as this document's source.

    ``web_path`` is used whenever it is set (not ``None``); otherwise the
    value of ``file_path`` is returned.
    """
    if self.web_path is None:
        return self.file_path
    return self.web_path
@ -440,3 +456,144 @@ class PDFPlumberLoader(BasePDFLoader):
parser = PDFPlumberParser ( text_kwargs = self . text_kwargs )
blob = Blob . from_path ( self . file_path )
return parser . parse ( blob )
class AmazonTextractPDFLoader(BasePDFLoader):
    """Load a document through Amazon Textract from a local file, HTTP or S3.

    Besides PDFs, the page-count check in this class recognises TIFF, PNG and
    JPEG inputs. A document with more than one page must be stored on S3;
    ``lazy_load`` raises ``ValueError`` for a multi-page document that is not.

    To authenticate, the AWS client uses the following methods to
    automatically load credentials:
    https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html

    If a specific credential profile should be used, you must pass
    the name of the profile from the ~/.aws/credentials file that is to be used.

    Make sure the credentials/roles used have the required policies to
    access the Amazon Textract service.

    Example:
        .. code-block:: python

            from langchain.document_loaders import AmazonTextractPDFLoader
            loader = AmazonTextractPDFLoader(
                file_path="s3://pdfs/myfile.pdf"
            )
            document = loader.load()
    """

    def __init__(
        self,
        file_path: str,
        textract_features: Optional[Sequence[str]] = None,
        client: Optional[Any] = None,
        credentials_profile_name: Optional[str] = None,
        region_name: Optional[str] = None,
        endpoint_url: Optional[str] = None,
    ) -> None:
        """Initialize the loader.

        Args:
            file_path: A file, url or s3 path for input file
            textract_features: Features to be used for extraction, each feature
                               should be passed as a str that conforms to the enum
                               `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client (Optional). It is replaced by a newly
                    created client when any of ``credentials_profile_name``,
                    ``region_name`` or ``endpoint_url`` is given.
            credentials_profile_name: AWS profile name, if not default (Optional)
            region_name: AWS region, eg us-east-1 (Optional)
            endpoint_url: endpoint url for the textract service (Optional)

        Raises:
            ModuleNotFoundError: if ``amazon-textract-caller`` cannot be imported,
                or if ``boto3`` cannot be imported when a client must be created.
            ValueError: if creating the boto3 session or textract client fails.
        """
        super().__init__(file_path)

        try:
            import textractcaller as tc
        except ImportError as e:
            raise ModuleNotFoundError(
                "Could not import amazon-textract-caller python package. "
                "Please install it with `pip install amazon-textract-caller`."
            ) from e

        # Translate feature names into the enum members the parser expects.
        features = [tc.Textract_Features[x] for x in textract_features or []]

        if credentials_profile_name or region_name or endpoint_url:
            # Keep the import in its own try so that only a missing package is
            # reported as such; every other failure is a configuration problem.
            try:
                import boto3
            except ImportError as e:
                raise ModuleNotFoundError(
                    "Could not import boto3 python package. "
                    "Please install it with `pip install boto3`."
                ) from e

            try:
                if credentials_profile_name is not None:
                    session = boto3.Session(profile_name=credentials_profile_name)
                else:
                    # use default credentials
                    session = boto3.Session()

                client_params = {}
                if region_name:
                    client_params["region_name"] = region_name
                if endpoint_url:
                    client_params["endpoint_url"] = endpoint_url

                client = session.client("textract", **client_params)
            except Exception as e:
                raise ValueError(
                    "Could not load credentials to authenticate with AWS client. "
                    "Please check that credentials in the specified "
                    "profile name are valid."
                ) from e

        self.parser = AmazonTextractPDFParser(textract_features=features, client=client)

    def load(self) -> List[Document]:
        """Load given path as pages."""
        return list(self.lazy_load())

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily yield the documents produced by the Textract parser.

        Raises:
            ValueError: if the input has more than one page and did not
                originate from S3.
        """
        # self.file_path is local, but the blob has to carry the S3 location
        # when the file originated from S3 so multi-page documents can be parsed.
        if self.web_path and self._is_s3_url(self.web_path):
            blob = Blob(path=self.web_path)
        else:
            blob = Blob.from_path(self.file_path)
            if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
                # Adjacent literals instead of backslash continuations, so the
                # message does not contain the source indentation.
                raise ValueError(
                    f"the file {blob.path} is a multi-page document, "
                    "but not stored on S3. "
                    "Textract requires multi-page documents to be on S3."
                )

        yield from self.parser.parse(blob)

    @staticmethod
    def _get_number_of_pages(blob: Blob) -> int:
        """Return the number of pages (or image frames) in ``blob``.

        PDFs are counted with ``pypdf``, TIFFs by iterating their frames with
        Pillow, and PNG/JPEG inputs always count as a single page.

        Raises:
            ModuleNotFoundError: if ``pypdf`` or ``Pillow`` cannot be imported.
            ValueError: if the blob's mimetype is none of the supported ones.
        """
        try:
            import pypdf
            from PIL import Image, ImageSequence
        except ImportError as e:
            raise ModuleNotFoundError(
                "Could not import pypdf or Pillow python package. "
                "Please install it with `pip install pypdf Pillow`."
            ) from e

        if blob.mimetype == "application/pdf":
            with blob.as_bytes_io() as input_pdf_file:
                pdf_reader = pypdf.PdfReader(input_pdf_file)
                return len(pdf_reader.pages)

        if blob.mimetype == "image/tiff":
            # Image.open needs a path or a file object; raw bytes would be
            # interpreted as a filename, so read through a byte stream instead.
            with blob.as_bytes_io() as input_tiff_file:
                with Image.open(input_tiff_file) as img:
                    return sum(1 for _ in ImageSequence.Iterator(img))

        if blob.mimetype in ["image/png", "image/jpeg"]:
            return 1

        raise ValueError(f"unsupported mime type: {blob.mimetype}")