Add titles to metadata in Google Drive loader (#2260)

I noticed the Google Drive loader does not include the "title" metadata for
Google Docs and PDFs. This adds that field so they match what is already done for Sheets.
doc
Patrick Storm 1 year ago committed by GitHub
parent 4b59bb55c7
commit 64f44c6483
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -148,6 +148,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)
file = service.files().get(fileId=id).execute()
request = service.files().export_media(fileId=id, mimeType="text/plain")
fh = BytesIO()
downloader = MediaIoBaseDownload(fh, request)
@ -163,7 +164,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
print("An error occurred: {}".format(e))
text = fh.getvalue().decode("utf-8")
metadata = {"source": f"https://docs.google.com/document/d/{id}/edit"}
metadata = {
"source": f"https://docs.google.com/document/d/{id}/edit",
"title": f"{file.get('name')}",
}
return Document(page_content=text, metadata=metadata)
def _load_documents_from_folder(self) -> List[Document]:
@ -213,6 +217,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)
file = service.files().get(fileId=id).execute()
request = service.files().get_media(fileId=id)
fh = BytesIO()
downloader = MediaIoBaseDownload(fh, request)
@ -230,6 +235,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
page_content=page.extract_text(),
metadata={
"source": f"https://drive.google.com/file/d/{id}/view",
"title": f"{file.get('name')}",
"page": i,
},
)

Loading…
Cancel
Save