From 519f0187b6fe7bd5d3b28b16b59ebc3f96fa4fc3 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Wed, 8 Mar 2023 20:15:36 -0800 Subject: [PATCH] Harrison/gdrive pdf (#1433) Co-authored-by: LM <93918064+LuisMalhadas@users.noreply.github.com> Co-authored-by: Luis Malhadas --- langchain/document_loaders/googledrive.py | 79 +++++++++++++++++++---- 1 file changed, 68 insertions(+), 11 deletions(-) diff --git a/langchain/document_loaders/googledrive.py b/langchain/document_loaders/googledrive.py index 10a9f655..cec86f50 100644 --- a/langchain/document_loaders/googledrive.py +++ b/langchain/document_loaders/googledrive.py @@ -26,16 +26,26 @@ class GoogleDriveLoader(BaseLoader, BaseModel): token_path: Path = Path.home() / ".credentials" / "token.json" folder_id: Optional[str] = None document_ids: Optional[List[str]] = None + file_ids: Optional[List[str]] = None @root_validator def validate_folder_id_or_document_ids( cls, values: Dict[str, Any] ) -> Dict[str, Any]: """Validate that either folder_id or document_ids is set, but not both.""" - if values.get("folder_id") and values.get("document_ids"): - raise ValueError("Cannot specify both folder_id and document_ids") - if not values.get("folder_id") and not values.get("document_ids"): - raise ValueError("Must specify either folder_id or document_ids") + if values.get("folder_id") and ( + values.get("document_ids") or values.get("file_ids") + ): + raise ValueError( + "Cannot specify both folder_id and document_ids nor " + "folder_id and file_ids" + ) + if ( + not values.get("folder_id") + and not values.get("document_ids") + and not values.get("file_ids") + ): + raise ValueError("Must specify either folder_id, document_ids, or file_ids") return values @validator("credentials_path") @@ -115,13 +125,16 @@ class GoogleDriveLoader(BaseLoader, BaseModel): .execute() ) items = results.get("files", []) + returns = [] + for item in items: + if item["mimeType"] == "application/vnd.google-apps.document": + returns.append(self._load_document_from_id(item["id"])) + elif item["mimeType"] == "application/pdf": + returns.extend(self._load_file_from_id(item["id"])) + else: + pass - return [ - self._load_document_from_id(item["id"]) - for item in items - # Only support Google Docs for now - if item["mimeType"] == "application/vnd.google-apps.document" - ] + return returns def _load_documents_from_ids(self) -> List[Document]: """Load documents from a list of IDs.""" @@ -130,9 +143,53 @@ class GoogleDriveLoader(BaseLoader, BaseModel): return [self._load_document_from_id(doc_id) for doc_id in self.document_ids] + def _load_file_from_id(self, id: str) -> List[Document]: + """Load a file from an ID.""" + from io import BytesIO + + from googleapiclient.discovery import build + from googleapiclient.http import MediaIoBaseDownload + + creds = self._load_credentials() + service = build("drive", "v3", credentials=creds) + + request = service.files().get_media(fileId=id) + fh = BytesIO() + downloader = MediaIoBaseDownload(fh, request) + done = False + while done is False: + status, done = downloader.next_chunk() + content = fh.getvalue() + + from PyPDF2 import PdfReader + + pdf_reader = PdfReader(BytesIO(content)) + + return [ + Document( + page_content=page.extract_text(), + metadata={ + "source": f"https://drive.google.com/file/d/{id}/view", + "page": i, + }, + ) + for i, page in enumerate(pdf_reader.pages) + ] + + def _load_file_from_ids(self) -> List[Document]: + """Load files from a list of IDs.""" + if not self.file_ids: + raise ValueError("file_ids must be set") + docs = [] + for file_id in self.file_ids: + docs.extend(self._load_file_from_id(file_id)) + return docs + def load(self) -> List[Document]: """Load documents.""" if self.folder_id: return self._load_documents_from_folder() - else: + elif self.document_ids: return self._load_documents_from_ids() + else: + return self._load_file_from_ids()