Load specific file types from Google Drive (issue #4878) (#4926)

# Load specific file types from Google Drive (issue #4878)
Add the ability to specify which file types should be loaded from a
Google Drive folder.
 
```
 loader = GoogleDriveLoader(
    folder_id="1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5",
    file_types=["document", "pdf"],
    recursive=False
)
```

Fixes #4878

## Who can review?
Community members can review the PR once tests pass. Tag
maintainers/contributors who might be interested:
DataLoaders
- @eyurtsev

Twitter: [@UmerHAdil](https://twitter.com/@UmerHAdil) | Discord:
RicChilligerDude#7589

---------

Co-authored-by: UmerHA <40663591+UmerHA@users.noreply.github.com>
This commit is contained in:
Eugene Yurtsev 2023-05-18 09:27:53 -04:00 committed by GitHub
parent dfbf45f028
commit c06a47a691
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 69 additions and 7 deletions

View File

@ -1,6 +1,7 @@
{ {
"cells": [ "cells": [
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "b0ed136e-6983-4893-ae1b-b75753af05f8", "id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
"metadata": {}, "metadata": {},
@ -75,6 +76,29 @@
"source": [ "source": [
"docs = loader.load()" "docs = loader.load()"
] ]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "2721ba8a",
"metadata": {},
"source": [
"When you pass a `folder_id`, all files of type document, sheet, and pdf are loaded by default. You can modify this behaviour by passing a `file_types` argument."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ff83b4c",
"metadata": {},
"outputs": [],
"source": [
"loader = GoogleDriveLoader(\n",
" folder_id=\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\",\n",
" file_types=[\"document\", \"sheet\"],\n",
" recursive=False\n",
")"
]
} }
], ],
"metadata": { "metadata": {

View File

@ -10,7 +10,7 @@
# https://cloud.google.com/iam/docs/service-accounts-create # https://cloud.google.com/iam/docs/service-accounts-create
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Union from typing import Any, Dict, List, Optional, Sequence, Union
from pydantic import BaseModel, root_validator, validator from pydantic import BaseModel, root_validator, validator
@ -30,11 +30,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
document_ids: Optional[List[str]] = None document_ids: Optional[List[str]] = None
file_ids: Optional[List[str]] = None file_ids: Optional[List[str]] = None
recursive: bool = False recursive: bool = False
file_types: Optional[Sequence[str]] = None
@root_validator @root_validator
def validate_folder_id_or_document_ids( def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
cls, values: Dict[str, Any]
) -> Dict[str, Any]:
"""Validate that either folder_id or document_ids is set, but not both.""" """Validate that either folder_id or document_ids is set, but not both."""
if values.get("folder_id") and ( if values.get("folder_id") and (
values.get("document_ids") or values.get("file_ids") values.get("document_ids") or values.get("file_ids")
@ -49,6 +48,35 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
and not values.get("file_ids") and not values.get("file_ids")
): ):
raise ValueError("Must specify either folder_id, document_ids, or file_ids") raise ValueError("Must specify either folder_id, document_ids, or file_ids")
file_types = values.get("file_types")
if file_types:
if values.get("document_ids") or values.get("file_ids"):
raise ValueError(
"file_types can only be given when folder_id is given,"
" (not when document_ids or file_ids are given)."
)
type_mapping = {
"document": "application/vnd.google-apps.document",
"sheet": "application/vnd.google-apps.spreadsheet",
"pdf": "application/pdf",
}
allowed_types = list(type_mapping.keys()) + list(type_mapping.values())
short_names = ", ".join([f"'{x}'" for x in type_mapping.keys()])
full_names = ", ".join([f"'{x}'" for x in type_mapping.values()])
for file_type in file_types:
if file_type not in allowed_types:
raise ValueError(
f"Given file type {file_type} is not supported. "
f"Supported values are: {short_names}; and "
f"their full-form names: {full_names}"
)
# replace short-form file types by full-form file types
def full_form(x: str) -> str:
return type_mapping[x] if x in type_mapping else x
values["file_types"] = [full_form(file_type) for file_type in file_types]
return values return values
@validator("credentials_path") @validator("credentials_path")
@ -171,15 +199,23 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
} }
return Document(page_content=text, metadata=metadata) return Document(page_content=text, metadata=metadata)
def _load_documents_from_folder(self, folder_id: str) -> List[Document]: def _load_documents_from_folder(
self, folder_id: str, *, file_types: Optional[Sequence[str]] = None
) -> List[Document]:
"""Load documents from a folder.""" """Load documents from a folder."""
from googleapiclient.discovery import build from googleapiclient.discovery import build
creds = self._load_credentials() creds = self._load_credentials()
service = build("drive", "v3", credentials=creds) service = build("drive", "v3", credentials=creds)
files = self._fetch_files_recursive(service, folder_id) files = self._fetch_files_recursive(service, folder_id)
# If file types filter is provided, we'll filter by the file type.
if file_types:
_files = [f for f in files if f["mimeType"] in file_types] # type: ignore
else:
_files = files
returns = [] returns = []
for file in files: for file in _files:
if file["mimeType"] == "application/vnd.google-apps.document": if file["mimeType"] == "application/vnd.google-apps.document":
returns.append(self._load_document_from_id(file["id"])) # type: ignore returns.append(self._load_document_from_id(file["id"])) # type: ignore
elif file["mimeType"] == "application/vnd.google-apps.spreadsheet": elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
@ -271,7 +307,9 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load documents.""" """Load documents."""
if self.folder_id: if self.folder_id:
return self._load_documents_from_folder(self.folder_id) return self._load_documents_from_folder(
self.folder_id, file_types=self.file_types
)
elif self.document_ids: elif self.document_ids:
return self._load_documents_from_ids() return self._load_documents_from_ids()
else: else: