diff --git a/docs/modules/document_loaders/examples/googledrive.ipynb b/docs/modules/document_loaders/examples/googledrive.ipynb new file mode 100644 index 00000000..126252e0 --- /dev/null +++ b/docs/modules/document_loaders/examples/googledrive.ipynb @@ -0,0 +1,84 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b0ed136e-6983-4893-ae1b-b75753af05f8", + "metadata": {}, + "source": [ + "# Google Drive\n", + "This notebook covers how to load documents from Google Drive. Currently, only Google Docs are supported.\n", + "\n", + "## Prerequisites\n", + "\n", + "1. Create a Google Cloud project or use an existing project\n", + "1. Enable the [Google Drive API](https://console.cloud.google.com/flows/enableapi?apiid=drive.googleapis.com)\n", + "1. [Authorize credentials for desktop app](https://developers.google.com/drive/api/quickstart/python#authorize_credentials_for_a_desktop_application)\n", + "1. `pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib`\n", + "\n", + "## 🧑 Instructions for ingesting your Google Docs data\n", + "By default, the `GoogleDriveLoader` expects the `credentials.json` file to be at `~/.credentials/credentials.json`, but this is configurable using the `credentials_path` keyword argument. Likewise, `token.json` is expected at `~/.credentials/token.json` by default, which is configurable using the `token_path` keyword argument. Note that `token.json` will be created automatically the first time you use the loader.\n", + "\n", + "`GoogleDriveLoader` can load from a list of Google Docs document ids or a folder id. 
You can obtain your folder and document id from the URL:\n", + "* Folder: https://drive.google.com/drive/u/0/folders/1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5 -> folder id is `\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\"`\n", + "* Document: https://docs.google.com/document/d/1bfaMQ18_i56204VaQDVeAFpqEijJTgvurupdEDiaUQw/edit -> document id is `\"1bfaMQ18_i56204VaQDVeAFpqEijJTgvurupdEDiaUQw\"`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "878928a6-a5ae-4f74-b351-64e3b01733fe", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import GoogleDriveLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2216c83f-68e4-4d2f-8ea2-5878fb18bbe7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "loader = GoogleDriveLoader(folder_id=\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8f3b6aa0-b45d-4e37-8c50-5bebe70fdb9d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index d564f63b..328fcdb9 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -1,6 +1,7 @@ """All different types of document loaders.""" from langchain.document_loaders.directory import DirectoryLoader +from langchain.document_loaders.googledrive import GoogleDriveLoader from langchain.document_loaders.html import UnstructuredHTMLLoader from 
"""Loader that loads data from Google Drive."""

# Prerequisites:
# 1. Create a Google Cloud project
# 2. Enable the Google Drive API:
#   https://console.cloud.google.com/flows/enableapi?apiid=drive.googleapis.com
# 3. Authorize credentials for desktop app:
#   https://developers.google.com/drive/api/quickstart/python#authorize_credentials_for_a_desktop_application # noqa: E501


from pathlib import Path
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, root_validator, validator

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]


class GoogleDriveLoader(BaseLoader, BaseModel):
    """Loader that loads Google Docs from Google Drive.

    Exactly one of ``folder_id`` or ``document_ids`` must be given. With
    ``folder_id``, every Google Doc directly inside that folder is loaded;
    other file types in the folder are skipped. With ``document_ids``, each
    listed document is loaded.
    """

    # OAuth client secrets file downloaded from the Google Cloud console.
    credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
    # Cached user token; written by ``_load_credentials`` after authorization.
    token_path: Path = Path.home() / ".credentials" / "token.json"
    # ID of the Drive folder to load from (mutually exclusive with document_ids).
    folder_id: Optional[str] = None
    # IDs of the Google Docs to load (mutually exclusive with folder_id).
    document_ids: Optional[List[str]] = None

    @root_validator
    def validate_folder_id_or_document_ids(
        cls, values: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate that either folder_id or document_ids is set, but not both.

        Raises:
            ValueError: If both or neither of the two fields are set.
        """
        has_folder = bool(values.get("folder_id"))
        has_documents = bool(values.get("document_ids"))
        if has_folder and has_documents:
            raise ValueError("Cannot specify both folder_id and document_ids")
        if not has_folder and not has_documents:
            raise ValueError("Must specify either folder_id or document_ids")
        return values

    @validator("credentials_path")
    def validate_credentials_path(cls, v: Any, **kwargs: Any) -> Any:
        """Validate that credentials_path exists.

        Raises:
            ValueError: If the given path does not exist on disk.
        """
        if not v.exists():
            raise ValueError(f"credentials_path {v} does not exist")
        return v

    @staticmethod
    def _missing_dependency_error() -> ImportError:
        """Build the error raised when the Google client libraries are missing."""
        return ImportError(
            "You must run "
            "`pip install --upgrade "
            "google-api-python-client google-auth-httplib2 "
            "google-auth-oauthlib` "
            "to use the Google Drive loader."
        )

    def _load_credentials(self) -> Any:
        """Load OAuth credentials, refreshing or re-authorizing when needed.

        A token cached at ``token_path`` is reused when valid. An expired token
        with a refresh token is refreshed; otherwise the installed-app flow is
        run against ``credentials_path``. The resulting token is written back
        to ``token_path``.

        Returns:
            The credentials object to pass to the Drive API client.

        Raises:
            ImportError: If the Google auth libraries are not installed.
        """
        # Adapted from https://developers.google.com/drive/api/v3/quickstart/python
        try:
            from google.auth.transport.requests import Request
            from google.oauth2.credentials import Credentials
            from google_auth_oauthlib.flow import InstalledAppFlow
        except ImportError as err:
            raise self._missing_dependency_error() from err

        creds = None
        if self.token_path.exists():
            creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    str(self.credentials_path), SCOPES
                )
                creds = flow.run_local_server(port=0)
            # A custom token_path may point into a directory that does not
            # exist yet; create it so a completed authorization is not lost.
            self.token_path.parent.mkdir(parents=True, exist_ok=True)
            self.token_path.write_text(creds.to_json())

        return creds

    def _build_service(self) -> Any:
        """Build an authorized Drive v3 API client.

        Raises:
            ImportError: If ``google-api-python-client`` is not installed.
        """
        try:
            from googleapiclient.discovery import build
        except ImportError as err:
            raise self._missing_dependency_error() from err

        return build("drive", "v3", credentials=self._load_credentials())

    def _load_document_from_id(self, id: str, service: Any = None) -> Document:
        """Load a document from an ID.

        Args:
            id: The Google Docs document ID.
            service: An already-built Drive client to reuse. When omitted, a
                new client is built for this call.

        Returns:
            A ``Document`` holding the plain-text export of the Google Doc,
            with the document's edit URL as its ``source`` metadata.
        """
        from io import BytesIO

        try:
            from googleapiclient.http import MediaIoBaseDownload
        except ImportError as err:
            raise self._missing_dependency_error() from err

        if service is None:
            service = self._build_service()

        request = service.files().export_media(fileId=id, mimeType="text/plain")
        buffer = BytesIO()
        downloader = MediaIoBaseDownload(buffer, request)
        done = False
        while not done:
            # next_chunk() also returns a progress status, which is not needed.
            _, done = downloader.next_chunk()
        text = buffer.getvalue().decode("utf-8")
        metadata = {"source": f"https://docs.google.com/document/d/{id}/edit"}
        return Document(page_content=text, metadata=metadata)

    def _load_documents_from_folder(self) -> List[Document]:
        """Load documents from a folder.

        Follows ``nextPageToken`` so that folders with more entries than fit
        in one listing page are loaded completely.

        Returns:
            One ``Document`` per Google Doc found directly in ``folder_id``.
        """
        # Build the client once and reuse it for the listing and every export.
        service = self._build_service()

        docs: List[Document] = []
        page_token = None
        while True:
            results = (
                service.files()
                .list(
                    q=f"'{self.folder_id}' in parents",
                    pageSize=1000,
                    fields="nextPageToken, files(id, name, mimeType)",
                    pageToken=page_token,
                )
                .execute()
            )
            for item in results.get("files", []):
                # Only support Google Docs for now
                if item["mimeType"] == "application/vnd.google-apps.document":
                    docs.append(self._load_document_from_id(item["id"], service))

            page_token = results.get("nextPageToken")
            if not page_token:
                break
        return docs

    def _load_documents_from_ids(self) -> List[Document]:
        """Load documents from a list of IDs.

        Raises:
            ValueError: If ``document_ids`` is unset or empty.
        """
        if not self.document_ids:
            raise ValueError("document_ids must be set")

        # Build the client once and reuse it for every document.
        service = self._build_service()
        return [
            self._load_document_from_id(doc_id, service)
            for doc_id in self.document_ids
        ]

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            The documents from ``folder_id`` when it is set, otherwise the
            documents listed in ``document_ids``.
        """
        if self.folder_id:
            return self._load_documents_from_folder()
        return self._load_documents_from_ids()