From a8cb9ee013f1aea9d700e917cba6f254145f696f Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 18 Jun 2023 11:07:23 -0700 Subject: [PATCH] Harrison/gdrive enhancements (#6375) Co-authored-by: Matt Robinson --- .../integrations/google_drive.ipynb | 133 +++++++++++++++++- langchain/document_loaders/googledrive.py | 48 ++++--- 2 files changed, 162 insertions(+), 19 deletions(-) diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb index e5346130..fe59db46 100644 --- a/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb +++ b/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "b0ed136e-6983-4893-ae1b-b75753af05f8", "metadata": {}, @@ -78,7 +77,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "2721ba8a", "metadata": {}, @@ -99,6 +97,135 @@ " recursive=False\n", ")" ] + }, + { + "cell_type": "markdown", + "id": "d6b80931", + "metadata": {}, + "source": [ + "## Passing in Optional File Loaders\n", + "\n", + "When processing files other than Google Docs and Google Sheets, it can be helpful to pass an optional file loader to `GoogleDriveLoader`. If you pass in a file loader, that file loader will be used on documents that do not have a Google Docs or Google Sheets MIME type. Here is an example of how to load an Excel document from Google Drive using a file loader. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "94207e39", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import GoogleDriveLoader\n", + "from langchain.document_loaders import UnstructuredFileIOLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a15fbee0", + "metadata": {}, + "outputs": [], + "source": [ + "file_id=\"1x9WBtFPWMEAdjcJzPScRsjpjQvpSo_kz\"\n", + "loader = GoogleDriveLoader(\n", + " file_ids=[file_id],\n", + " file_loader_cls=UnstructuredFileIOLoader,\n", + " file_loader_kwargs={\"mode\": \"elements\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "98410bda", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e3e72221", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "markdown", + "id": "238cd06f", + "metadata": {}, + "source": [ + "You can also process a folder with a mix of files and Google Docs/Sheets using the following pattern:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0e2d093f", + "metadata": {}, + "outputs": [], + "source": [ + "folder_id=\"1asMOHY1BqBS84JcRbOag5LOJac74gpmD\"\n", + "loader = GoogleDriveLoader(\n", + " folder_id=folder_id,\n", + " file_loader_cls=UnstructuredFileIOLoader,\n", + " file_loader_kwargs={\"mode\": \"elements\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b35ddcc6", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3cc141e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '\\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n
TeamLocationStanley Cups
BluesSTL1
FlyersPHI2
Maple LeafsTOR13
', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e312268a", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -117,7 +244,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.8.13" } }, "nbformat": 4, diff --git a/langchain/document_loaders/googledrive.py b/langchain/document_loaders/googledrive.py index c28bf705..5e21d401 100644 --- a/langchain/document_loaders/googledrive.py +++ b/langchain/document_loaders/googledrive.py @@ -33,6 +33,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel): recursive: bool = False file_types: Optional[Sequence[str]] = None load_trashed_files: bool = False + # NOTE(MthwRobinson) - changing the file_loader_cls to type here currently + # results in pydantic validation errors + file_loader_cls: Any = None + file_loader_kwargs: Dict["str", Any] = {} @root_validator def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]: @@ -231,7 +235,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel): returns.append(self._load_document_from_id(file["id"])) # type: ignore elif file["mimeType"] == "application/vnd.google-apps.spreadsheet": returns.extend(self._load_sheet_from_id(file["id"])) # type: ignore - elif file["mimeType"] == "application/pdf": + elif ( + file["mimeType"] == "application/pdf" + or self.file_loader_cls is not None + ): returns.extend(self._load_file_from_id(file["id"])) # type: ignore else: pass @@ -287,23 +294,32 @@ class GoogleDriveLoader(BaseLoader, BaseModel): done = False while done is False: status, done = downloader.next_chunk() - content = fh.getvalue() - from PyPDF2 import PdfReader + if self.file_loader_cls is not None: + fh.seek(0) + loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs) + docs = loader.load() + for doc in docs: + doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view" + return docs - pdf_reader = PdfReader(BytesIO(content)) - - return [ - Document( - page_content=page.extract_text(), - metadata={ - "source": f"https://drive.google.com/file/d/{id}/view", - "title": f"{file.get('name')}", - "page": i, - }, - ) - for i, page in enumerate(pdf_reader.pages) - ] + else: + from PyPDF2 import PdfReader + + content = fh.getvalue() + pdf_reader = PdfReader(BytesIO(content)) + + return [ + Document( + page_content=page.extract_text(), + metadata={ + "source": f"https://drive.google.com/file/d/{id}/view", + "title": f"{file.get('name')}", + "page": i, + }, + ) + for i, page in enumerate(pdf_reader.pages) + ] def _load_file_from_ids(self) -> List[Document]: """Load files from a list of IDs."""