Add ability for GoogleDrive loader to load google sheets (#2135)

Currently only google documents and pdfs can be loaded from google
drive. This PR implements the latest recommended method for getting
google sheets including all tabs.

It currently parses the google sheet data the exact same way as the csv
loader - the only difference is that the gdrive sheets loader is not
using the `csv` library since the data is already in a list.
This commit is contained in:
Patrick Storm 2023-03-29 07:56:04 -07:00 committed by GitHub
parent b5449a866d
commit 7bea3b302c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -96,6 +96,47 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
return creds
def _load_sheet_from_id(self, id: str) -> List[Document]:
"""Load a sheet and all tabs from an ID."""
from googleapiclient.discovery import build
creds = self._load_credentials()
sheets_service = build("sheets", "v4", credentials=creds)
spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=id).execute()
sheets = spreadsheet.get("sheets", [])
documents = []
for sheet in sheets:
sheet_name = sheet["properties"]["title"]
result = (
sheets_service.spreadsheets()
.values()
.get(spreadsheetId=id, range=sheet_name)
.execute()
)
values = result.get("values", [])
header = values[0]
for i, row in enumerate(values[1:], start=1):
metadata = {
"source": (
f"https://docs.google.com/spreadsheets/d/{id}/"
f"edit?gid={sheet['properties']['sheetId']}"
),
"title": f"{spreadsheet['properties']['title']} - {sheet_name}",
"row": i,
}
content = []
for j, v in enumerate(row):
title = header[j].strip() if len(header) > j else ""
content.append(f"{title}: {v.strip()}")
page_content = "\n".join(content)
documents.append(Document(page_content=page_content, metadata=metadata))
return documents
def _load_document_from_id(self, id: str) -> Document:
"""Load a document from an ID."""
from io import BytesIO
@ -137,6 +178,8 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
for item in items:
if item["mimeType"] == "application/vnd.google-apps.document":
returns.append(self._load_document_from_id(item["id"]))
elif item["mimeType"] == "application/vnd.google-apps.spreadsheet":
returns.extend(self._load_sheet_from_id(item["id"]))
elif item["mimeType"] == "application/pdf":
returns.extend(self._load_file_from_id(item["id"]))
else: