From 64f44c64839fa5530b1ae6490dae640dd4074a5e Mon Sep 17 00:00:00 2001 From: Patrick Storm Date: Sat, 1 Apr 2023 08:43:34 -0700 Subject: [PATCH] Add titles to metadatas in gdrive loader (#2260) I noticed that the Google Drive loader does not set the "title" metadata for Google Docs and PDFs. This just adds that info to match the metadata already provided for Google Sheets. --- langchain/document_loaders/googledrive.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/langchain/document_loaders/googledrive.py b/langchain/document_loaders/googledrive.py index cf3d44ee..8695bcdf 100644 --- a/langchain/document_loaders/googledrive.py +++ b/langchain/document_loaders/googledrive.py @@ -148,6 +148,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel): creds = self._load_credentials() service = build("drive", "v3", credentials=creds) + file = service.files().get(fileId=id).execute() request = service.files().export_media(fileId=id, mimeType="text/plain") fh = BytesIO() downloader = MediaIoBaseDownload(fh, request) @@ -163,7 +164,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel): print("An error occurred: {}".format(e)) text = fh.getvalue().decode("utf-8") - metadata = {"source": f"https://docs.google.com/document/d/{id}/edit"} + metadata = { + "source": f"https://docs.google.com/document/d/{id}/edit", + "title": f"{file.get('name')}", + } return Document(page_content=text, metadata=metadata) def _load_documents_from_folder(self) -> List[Document]: @@ -213,6 +217,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel): creds = self._load_credentials() service = build("drive", "v3", credentials=creds) + file = service.files().get(fileId=id).execute() request = service.files().get_media(fileId=id) fh = BytesIO() downloader = MediaIoBaseDownload(fh, request) @@ -230,6 +235,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel): page_content=page.extract_text(), metadata={ "source": f"https://drive.google.com/file/d/{id}/view", + "title": f"{file.get('name')}", "page": i, }, )