Load specific file types from Google Drive (issue #4878) (#4926)

# Load specific file types from Google Drive (issue #4878)
Add the possibility to define what file types you want to load from
Google Drive.
 
```
 loader = GoogleDriveLoader(
    folder_id="1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5",
    file_types=["document", "pdf"]
    recursive=False
)
```

Fixes ##4878

## Who can review?
Community members can review the PR once tests pass. Tag
maintainers/contributors who might be interested:
DataLoaders
- @eyurtsev

Twitter: [@UmerHAdil](https://twitter.com/@UmerHAdil) | Discord:
RicChilligerDude#7589

---------

Co-authored-by: UmerHA <40663591+UmerHA@users.noreply.github.com>
This commit is contained in:
Eugene Yurtsev 2023-05-18 09:27:53 -04:00 committed by GitHub
parent dfbf45f028
commit c06a47a691
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 69 additions and 7 deletions

View File

@ -1,6 +1,7 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
"metadata": {},
@ -75,6 +76,29 @@
"source": [
"docs = loader.load()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "2721ba8a",
"metadata": {},
"source": [
"When you pass a `folder_id` by default all files of type document, sheet and pdf are loaded. You can modify this behaviour by passing a `file_types` argument "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2ff83b4c",
"metadata": {},
"outputs": [],
"source": [
"loader = GoogleDriveLoader(\n",
" folder_id=\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\",\n",
" file_types=[\"document\", \"sheet\"]\n",
" recursive=False\n",
")"
]
}
],
"metadata": {

View File

@ -10,7 +10,7 @@
# https://cloud.google.com/iam/docs/service-accounts-create
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Sequence, Union
from pydantic import BaseModel, root_validator, validator
@ -30,11 +30,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
document_ids: Optional[List[str]] = None
file_ids: Optional[List[str]] = None
recursive: bool = False
file_types: Optional[Sequence[str]] = None
@root_validator
def validate_folder_id_or_document_ids(
cls, values: Dict[str, Any]
) -> Dict[str, Any]:
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
"""Validate that either folder_id or document_ids is set, but not both."""
if values.get("folder_id") and (
values.get("document_ids") or values.get("file_ids")
@ -49,6 +48,35 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
and not values.get("file_ids")
):
raise ValueError("Must specify either folder_id, document_ids, or file_ids")
file_types = values.get("file_types")
if file_types:
if values.get("document_ids") or values.get("file_ids"):
raise ValueError(
"file_types can only be given when folder_id is given,"
" (not when document_ids or file_ids are given)."
)
type_mapping = {
"document": "application/vnd.google-apps.document",
"sheet": "application/vnd.google-apps.spreadsheet",
"pdf": "application/pdf",
}
allowed_types = list(type_mapping.keys()) + list(type_mapping.values())
short_names = ", ".join([f"'{x}'" for x in type_mapping.keys()])
full_names = ", ".join([f"'{x}'" for x in type_mapping.values()])
for file_type in file_types:
if file_type not in allowed_types:
raise ValueError(
f"Given file type {file_type} is not supported. "
f"Supported values are: {short_names}; and "
f"their full-form names: {full_names}"
)
# replace short-form file types by full-form file types
def full_form(x: str) -> str:
return type_mapping[x] if x in type_mapping else x
values["file_types"] = [full_form(file_type) for file_type in file_types]
return values
@validator("credentials_path")
@ -171,15 +199,23 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
}
return Document(page_content=text, metadata=metadata)
def _load_documents_from_folder(self, folder_id: str) -> List[Document]:
def _load_documents_from_folder(
self, folder_id: str, *, file_types: Optional[Sequence[str]] = None
) -> List[Document]:
"""Load documents from a folder."""
from googleapiclient.discovery import build
creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)
files = self._fetch_files_recursive(service, folder_id)
# If file types filter is provided, we'll filter by the file type.
if file_types:
_files = [f for f in files if f["mimeType"] in file_types] # type: ignore
else:
_files = files
returns = []
for file in files:
for file in _files:
if file["mimeType"] == "application/vnd.google-apps.document":
returns.append(self._load_document_from_id(file["id"])) # type: ignore
elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
@ -271,7 +307,9 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
def load(self) -> List[Document]:
"""Load documents."""
if self.folder_id:
return self._load_documents_from_folder(self.folder_id)
return self._load_documents_from_folder(
self.folder_id, file_types=self.file_types
)
elif self.document_ids:
return self._load_documents_from_ids()
else: