From 7bea3b302c02a9d2d125fbcbdad3d5f3ab48e9bc Mon Sep 17 00:00:00 2001
From: Patrick Storm
Date: Wed, 29 Mar 2023 07:56:04 -0700
Subject: [PATCH] Add ability for GoogleDrive loader to load google sheets (#2135)

Currently only google documents and pdfs can be loaded from google
drive. This PR implements the latest recommended method for getting
google sheets including all tabs.

It currently parses the google sheet data the exact same way as the csv
loader - the only difference is that the gdrive sheets loader is not
using the `csv` library since the data is already in a list.

Tabs that contain no values are skipped instead of raising an
IndexError on the missing header row.
---
 langchain/document_loaders/googledrive.py | 58 +++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/langchain/document_loaders/googledrive.py b/langchain/document_loaders/googledrive.py
index d6c8951e..e8ef11de 100644
--- a/langchain/document_loaders/googledrive.py
+++ b/langchain/document_loaders/googledrive.py
@@ -96,6 +96,62 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
 
         return creds
 
+    def _load_sheet_from_id(self, id: str) -> List[Document]:
+        """Load a sheet and all tabs from an ID.
+
+        Every data row of every tab becomes one Document whose content is a
+        newline-joined list of "column: value" pairs, using the tab's first
+        row as the column names. Tabs without any values are skipped.
+
+        Args:
+            id: The ID of the Google Sheets spreadsheet to load.
+
+        Returns:
+            One Document per non-header row, across all tabs.
+        """
+        from googleapiclient.discovery import build
+
+        creds = self._load_credentials()
+        sheets_service = build("sheets", "v4", credentials=creds)
+        spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=id).execute()
+        sheets = spreadsheet.get("sheets", [])
+
+        documents = []
+        for sheet in sheets:
+            sheet_name = sheet["properties"]["title"]
+            result = (
+                sheets_service.spreadsheets()
+                .values()
+                .get(spreadsheetId=id, range=sheet_name)
+                .execute()
+            )
+            values = result.get("values", [])
+            if not values:
+                # A tab with no values has no header row; indexing values[0]
+                # would raise IndexError and abort the remaining tabs.
+                continue
+
+            header = values[0]
+            for i, row in enumerate(values[1:], start=1):
+                metadata = {
+                    "source": (
+                        f"https://docs.google.com/spreadsheets/d/{id}/"
+                        f"edit?gid={sheet['properties']['sheetId']}"
+                    ),
+                    "title": f"{spreadsheet['properties']['title']} - {sheet_name}",
+                    "row": i,
+                }
+                content = []
+                for j, v in enumerate(row):
+                    # A row may be longer than the header; extras get "".
+                    title = header[j].strip() if len(header) > j else ""
+                    content.append(f"{title}: {v.strip()}")
+
+                page_content = "\n".join(content)
+                documents.append(Document(page_content=page_content, metadata=metadata))
+
+        return documents
+
     def _load_document_from_id(self, id: str) -> Document:
         """Load a document from an ID."""
         from io import BytesIO
@@ -137,6 +193,8 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
         for item in items:
             if item["mimeType"] == "application/vnd.google-apps.document":
                 returns.append(self._load_document_from_id(item["id"]))
+            elif item["mimeType"] == "application/vnd.google-apps.spreadsheet":
+                returns.extend(self._load_sheet_from_id(item["id"]))
             elif item["mimeType"] == "application/pdf":
                 returns.extend(self._load_file_from_id(item["id"]))
             else: