forked from Archives/langchain
Add ability for GoogleDrive loader to load google sheets (#2135)
Currently only Google Docs and PDFs can be loaded from Google Drive. This PR implements the latest recommended method for getting Google Sheets data, including all tabs. It currently parses the sheet data exactly the same way as the CSV loader; the only difference is that the Google Drive sheets loader does not use the `csv` library, since the data is already in a list.
This commit is contained in:
parent
b5449a866d
commit
7bea3b302c
@ -96,6 +96,47 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
|
||||
return creds
|
||||
|
||||
def _load_sheet_from_id(self, id: str) -> List[Document]:
    """Load a sheet and all tabs from an ID.

    Every tab of the spreadsheet is fetched through the Sheets v4 API. The
    first row of a tab is treated as the header; each following row becomes
    one ``Document`` whose content is a newline-joined list of
    ``"<header>: <cell>"`` pairs.

    Args:
        id: The spreadsheet ID passed to the Sheets API as ``spreadsheetId``.

    Returns:
        One ``Document`` per data row across all tabs, in tab order. Each
        document's metadata holds ``source`` (an edit URL pointing at the
        tab), ``title`` (``"<spreadsheet title> - <tab title>"``) and
        ``row`` (1-based index of the row below the header). Tabs with no
        cell values contribute no documents.
    """
    from googleapiclient.discovery import build

    creds = self._load_credentials()
    sheets_service = build("sheets", "v4", credentials=creds)
    spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=id).execute()
    sheets = spreadsheet.get("sheets", [])

    documents = []
    for sheet in sheets:
        sheet_name = sheet["properties"]["title"]
        result = (
            sheets_service.spreadsheets()
            .values()
            .get(spreadsheetId=id, range=sheet_name)
            .execute()
        )
        values = result.get("values", [])
        if not values:
            # A tab without any cell values yields no rows; indexing the
            # header would raise IndexError and abort the whole spreadsheet,
            # so skip the tab instead.
            continue

        header = values[0]
        # These are identical for every row of the tab, so build them once.
        source = (
            f"https://docs.google.com/spreadsheets/d/{id}/"
            f"edit?gid={sheet['properties']['sheetId']}"
        )
        title = f"{spreadsheet['properties']['title']} - {sheet_name}"

        for i, row in enumerate(values[1:], start=1):
            metadata = {
                "source": source,
                "title": title,
                "row": i,
            }
            content = []
            for j, v in enumerate(row):
                # A row may have more cells than the header; label the
                # extra cells with an empty column name.
                column = header[j].strip() if len(header) > j else ""
                content.append(f"{column}: {v.strip()}")

            page_content = "\n".join(content)
            documents.append(
                Document(page_content=page_content, metadata=metadata)
            )

    return documents
|
||||
|
||||
def _load_document_from_id(self, id: str) -> Document:
|
||||
"""Load a document from an ID."""
|
||||
from io import BytesIO
|
||||
@ -137,6 +178,8 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
for item in items:
|
||||
if item["mimeType"] == "application/vnd.google-apps.document":
|
||||
returns.append(self._load_document_from_id(item["id"]))
|
||||
elif item["mimeType"] == "application/vnd.google-apps.spreadsheet":
|
||||
returns.extend(self._load_sheet_from_id(item["id"]))
|
||||
elif item["mimeType"] == "application/pdf":
|
||||
returns.extend(self._load_file_from_id(item["id"]))
|
||||
else:
|
||||
|
Loading…
Reference in New Issue
Block a user