@ -33,6 +33,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
recursive : bool = False
file_types : Optional [ Sequence [ str ] ] = None
load_trashed_files : bool = False
# NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
# results in pydantic validation errors
file_loader_cls : Any = None
file_loader_kwargs : Dict [ " str " , Any ] = { }
@root_validator
def validate_inputs ( cls , values : Dict [ str , Any ] ) - > Dict [ str , Any ] :
@ -231,7 +235,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
returns . append ( self . _load_document_from_id ( file [ " id " ] ) ) # type: ignore
elif file [ " mimeType " ] == " application/vnd.google-apps.spreadsheet " :
returns . extend ( self . _load_sheet_from_id ( file [ " id " ] ) ) # type: ignore
elif file [ " mimeType " ] == " application/pdf " :
elif (
file [ " mimeType " ] == " application/pdf "
or self . file_loader_cls is not None
) :
returns . extend ( self . _load_file_from_id ( file [ " id " ] ) ) # type: ignore
else :
pass
@ -287,23 +294,32 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
done = False
while done is False :
status , done = downloader . next_chunk ( )
content = fh . getvalue ( )
from PyPDF2 import PdfReader
if self . file_loader_cls is not None :
fh . seek ( 0 )
loader = self . file_loader_cls ( file = fh , * * self . file_loader_kwargs )
docs = loader . load ( )
for doc in docs :
doc . metadata [ " source " ] = f " https://drive.google.com/file/d/ { id } /view "
return docs
pdf_reader = PdfReader ( BytesIO ( content ) )
return [
Document (
page_content = page . extract_text ( ) ,
metadata = {
" source " : f " https://drive.google.com/file/d/ { id } /view " ,
" title " : f " { file . get ( ' name ' ) } " ,
" page " : i ,
} ,
)
for i , page in enumerate ( pdf_reader . pages )
]
else :
from PyPDF2 import PdfReader
content = fh . getvalue ( )
pdf_reader = PdfReader ( BytesIO ( content ) )
return [
Document (
page_content = page . extract_text ( ) ,
metadata = {
" source " : f " https://drive.google.com/file/d/ { id } /view " ,
" title " : f " { file . get ( ' name ' ) } " ,
" page " : i ,
} ,
)
for i , page in enumerate ( pdf_reader . pages )
]
def _load_file_from_ids ( self ) - > List [ Document ] :
""" Load files from a list of IDs. """