Add titles to metadata in gdrive loader (#2260)

I noticed that the Google Drive loader does not include the "title" metadata
for Google Docs and PDFs. This adds that field so they match what the loader
already provides for Google Sheets.
doc
Patrick Storm 1 year ago committed by GitHub
parent 4b59bb55c7
commit 64f44c6483
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -148,6 +148,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
creds = self._load_credentials() creds = self._load_credentials()
service = build("drive", "v3", credentials=creds) service = build("drive", "v3", credentials=creds)
file = service.files().get(fileId=id).execute()
request = service.files().export_media(fileId=id, mimeType="text/plain") request = service.files().export_media(fileId=id, mimeType="text/plain")
fh = BytesIO() fh = BytesIO()
downloader = MediaIoBaseDownload(fh, request) downloader = MediaIoBaseDownload(fh, request)
@ -163,7 +164,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
print("An error occurred: {}".format(e)) print("An error occurred: {}".format(e))
text = fh.getvalue().decode("utf-8") text = fh.getvalue().decode("utf-8")
metadata = {"source": f"https://docs.google.com/document/d/{id}/edit"} metadata = {
"source": f"https://docs.google.com/document/d/{id}/edit",
"title": f"{file.get('name')}",
}
return Document(page_content=text, metadata=metadata) return Document(page_content=text, metadata=metadata)
def _load_documents_from_folder(self) -> List[Document]: def _load_documents_from_folder(self) -> List[Document]:
@ -213,6 +217,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
creds = self._load_credentials() creds = self._load_credentials()
service = build("drive", "v3", credentials=creds) service = build("drive", "v3", credentials=creds)
file = service.files().get(fileId=id).execute()
request = service.files().get_media(fileId=id) request = service.files().get_media(fileId=id)
fh = BytesIO() fh = BytesIO()
downloader = MediaIoBaseDownload(fh, request) downloader = MediaIoBaseDownload(fh, request)
@ -230,6 +235,7 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
page_content=page.extract_text(), page_content=page.extract_text(),
metadata={ metadata={
"source": f"https://drive.google.com/file/d/{id}/view", "source": f"https://drive.google.com/file/d/{id}/view",
"title": f"{file.get('name')}",
"page": i, "page": i,
}, },
) )

Loading…
Cancel
Save