forked from Archives/langchain
# Load specific file types from Google Drive (issue #4878) Add the possibility to define what file types you want to load from Google Drive. ``` loader = GoogleDriveLoader( folder_id="1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5", file_types=["document", "pdf"], recursive=False ) ``` Fixes #4878 ## Who can review? Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested: DataLoaders - @eyurtsev Twitter: [@UmerHAdil](https://twitter.com/@UmerHAdil) | Discord: RicChilligerDude#7589 --------- Co-authored-by: UmerHA <40663591+UmerHA@users.noreply.github.com>
This commit is contained in:
parent
dfbf45f028
commit
c06a47a691
@ -1,6 +1,7 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
|
||||
"metadata": {},
|
||||
@ -75,6 +76,29 @@
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "2721ba8a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When you pass a `folder_id`, by default all files of type document, sheet and pdf are loaded. You can modify this behaviour by passing a `file_types` argument."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2ff83b4c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = GoogleDriveLoader(\n",
|
||||
" folder_id=\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\",\n",
|
||||
" file_types=[\"document\", \"sheet\"],\n",
|
||||
" recursive=False\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -10,7 +10,7 @@
|
||||
# https://cloud.google.com/iam/docs/service-accounts-create
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from typing import Any, Dict, List, Optional, Sequence, Union
|
||||
|
||||
from pydantic import BaseModel, root_validator, validator
|
||||
|
||||
@ -30,11 +30,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
document_ids: Optional[List[str]] = None
|
||||
file_ids: Optional[List[str]] = None
|
||||
recursive: bool = False
|
||||
file_types: Optional[Sequence[str]] = None
|
||||
|
||||
@root_validator
|
||||
def validate_folder_id_or_document_ids(
|
||||
cls, values: Dict[str, Any]
|
||||
) -> Dict[str, Any]:
|
||||
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Validate that either folder_id or document_ids is set, but not both."""
|
||||
if values.get("folder_id") and (
|
||||
values.get("document_ids") or values.get("file_ids")
|
||||
@ -49,6 +48,35 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
and not values.get("file_ids")
|
||||
):
|
||||
raise ValueError("Must specify either folder_id, document_ids, or file_ids")
|
||||
|
||||
file_types = values.get("file_types")
|
||||
if file_types:
|
||||
if values.get("document_ids") or values.get("file_ids"):
|
||||
raise ValueError(
|
||||
"file_types can only be given when folder_id is given,"
|
||||
" (not when document_ids or file_ids are given)."
|
||||
)
|
||||
type_mapping = {
|
||||
"document": "application/vnd.google-apps.document",
|
||||
"sheet": "application/vnd.google-apps.spreadsheet",
|
||||
"pdf": "application/pdf",
|
||||
}
|
||||
allowed_types = list(type_mapping.keys()) + list(type_mapping.values())
|
||||
short_names = ", ".join([f"'{x}'" for x in type_mapping.keys()])
|
||||
full_names = ", ".join([f"'{x}'" for x in type_mapping.values()])
|
||||
for file_type in file_types:
|
||||
if file_type not in allowed_types:
|
||||
raise ValueError(
|
||||
f"Given file type {file_type} is not supported. "
|
||||
f"Supported values are: {short_names}; and "
|
||||
f"their full-form names: {full_names}"
|
||||
)
|
||||
|
||||
# replace short-form file types by full-form file types
|
||||
def full_form(x: str) -> str:
|
||||
return type_mapping[x] if x in type_mapping else x
|
||||
|
||||
values["file_types"] = [full_form(file_type) for file_type in file_types]
|
||||
return values
|
||||
|
||||
@validator("credentials_path")
|
||||
@ -171,15 +199,23 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
}
|
||||
return Document(page_content=text, metadata=metadata)
|
||||
|
||||
def _load_documents_from_folder(self, folder_id: str) -> List[Document]:
|
||||
def _load_documents_from_folder(
|
||||
self, folder_id: str, *, file_types: Optional[Sequence[str]] = None
|
||||
) -> List[Document]:
|
||||
"""Load documents from a folder."""
|
||||
from googleapiclient.discovery import build
|
||||
|
||||
creds = self._load_credentials()
|
||||
service = build("drive", "v3", credentials=creds)
|
||||
files = self._fetch_files_recursive(service, folder_id)
|
||||
# If file types filter is provided, we'll filter by the file type.
|
||||
if file_types:
|
||||
_files = [f for f in files if f["mimeType"] in file_types] # type: ignore
|
||||
else:
|
||||
_files = files
|
||||
|
||||
returns = []
|
||||
for file in files:
|
||||
for file in _files:
|
||||
if file["mimeType"] == "application/vnd.google-apps.document":
|
||||
returns.append(self._load_document_from_id(file["id"])) # type: ignore
|
||||
elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
|
||||
@ -271,7 +307,9 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
if self.folder_id:
|
||||
return self._load_documents_from_folder(self.folder_id)
|
||||
return self._load_documents_from_folder(
|
||||
self.folder_id, file_types=self.file_types
|
||||
)
|
||||
elif self.document_ids:
|
||||
return self._load_documents_from_ids()
|
||||
else:
|
||||
|
Loading…
Reference in New Issue
Block a user