diff --git a/langchain/document_loaders/googledrive.py b/langchain/document_loaders/googledrive.py index d6c8951efe..e8ef11de1c 100644 --- a/langchain/document_loaders/googledrive.py +++ b/langchain/document_loaders/googledrive.py @@ -96,6 +96,47 @@ class GoogleDriveLoader(BaseLoader, BaseModel): return creds + def _load_sheet_from_id(self, id: str) -> List[Document]: + """Load a sheet and all tabs from an ID.""" + + from googleapiclient.discovery import build + + creds = self._load_credentials() + sheets_service = build("sheets", "v4", credentials=creds) + spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=id).execute() + sheets = spreadsheet.get("sheets", []) + + documents = [] + for sheet in sheets: + sheet_name = sheet["properties"]["title"] + result = ( + sheets_service.spreadsheets() + .values() + .get(spreadsheetId=id, range=sheet_name) + .execute() + ) + values = result.get("values", []) + + header = values[0] + for i, row in enumerate(values[1:], start=1): + metadata = { + "source": ( + f"https://docs.google.com/spreadsheets/d/{id}/" + f"edit?gid={sheet['properties']['sheetId']}" + ), + "title": f"{spreadsheet['properties']['title']} - {sheet_name}", + "row": i, + } + content = [] + for j, v in enumerate(row): + title = header[j].strip() if len(header) > j else "" + content.append(f"{title}: {v.strip()}") + + page_content = "\n".join(content) + documents.append(Document(page_content=page_content, metadata=metadata)) + + return documents + def _load_document_from_id(self, id: str) -> Document: """Load a document from an ID.""" from io import BytesIO @@ -137,6 +178,8 @@ class GoogleDriveLoader(BaseLoader, BaseModel): for item in items: if item["mimeType"] == "application/vnd.google-apps.document": returns.append(self._load_document_from_id(item["id"])) + elif item["mimeType"] == "application/vnd.google-apps.spreadsheet": + returns.extend(self._load_sheet_from_id(item["id"])) elif item["mimeType"] == "application/pdf": returns.extend(self._load_file_from_id(item["id"])) else: