Harrison/gdrive enhancements (#6375)

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
11 months ago · a8cb9ee013
parent ebfffaa38f
commit a8cb9ee013
2 changed files with 162 additions and 19 deletions
--- a/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb
@ -1,7 +1,6 @@
 {
 "cells": [
  {
-   "attachments": {},
   "cell_type": "markdown",
   "id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
   "metadata": {},
@ -78,7 +77,6 @@
   ]
  },
  {
-   "attachments": {},
   "cell_type": "markdown",
   "id": "2721ba8a",
   "metadata": {},
@ -99,6 +97,135 @@
    "    recursive=False\n",
    ")"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d6b80931",
+   "metadata": {},
+   "source": [
+    "## Passing in Optional File Loaders\n",
+    "\n",
+    "When processing files other than Google Docs and Google Sheets, it can be helpful to pass an optional file loader to `GoogleDriveLoader`. If you pass in a file loader, that file loader will be used on documents that do not have a Google Docs or Google Sheets MIME type. Here is an example of how to load an Excel document from Google Drive using a file loader. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "94207e39",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import GoogleDriveLoader\n",
+    "from langchain.document_loaders import UnstructuredFileIOLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a15fbee0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file_id=\"1x9WBtFPWMEAdjcJzPScRsjpjQvpSo_kz\"\n",
+    "loader = GoogleDriveLoader(\n",
+    "    file_ids=[file_id],\n",
+    "    file_loader_cls=UnstructuredFileIOLoader,\n",
+    "    file_loader_kwargs={\"mode\": \"elements\"}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "98410bda",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e3e72221",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='\\n  \\n    \\n      Team\\n      Location\\n      Stanley Cups\\n    \\n    \\n      Blues\\n      STL\\n      1\\n    \\n    \\n      Flyers\\n      PHI\\n      2\\n    \\n    \\n      Maple Leafs\\n      TOR\\n      13\\n    \\n  \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n  <tbody>\\n    <tr>\\n      <td>Team</td>\\n      <td>Location</td>\\n      <td>Stanley Cups</td>\\n    </tr>\\n    <tr>\\n      <td>Blues</td>\\n      <td>STL</td>\\n      <td>1</td>\\n    </tr>\\n    <tr>\\n      <td>Flyers</td>\\n      <td>PHI</td>\\n      <td>2</td>\\n    </tr>\\n    <tr>\\n      <td>Maple Leafs</td>\\n      <td>TOR</td>\\n      <td>13</td>\\n    </tr>\\n  </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "238cd06f",
+   "metadata": {},
+   "source": [
+    "You can also process a folder with a mix of files and Google Docs/Sheets using the following pattern:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "0e2d093f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "folder_id=\"1asMOHY1BqBS84JcRbOag5LOJac74gpmD\"\n",
+    "loader = GoogleDriveLoader(\n",
+    "    folder_id=folder_id,\n",
+    "    file_loader_cls=UnstructuredFileIOLoader,\n",
+    "    file_loader_kwargs={\"mode\": \"elements\"}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b35ddcc6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "3cc141e0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content='\\n  \\n    \\n      Team\\n      Location\\n      Stanley Cups\\n    \\n    \\n      Blues\\n      STL\\n      1\\n    \\n    \\n      Flyers\\n      PHI\\n      2\\n    \\n    \\n      Maple Leafs\\n      TOR\\n      13\\n    \\n  \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n  <tbody>\\n    <tr>\\n      <td>Team</td>\\n      <td>Location</td>\\n      <td>Stanley Cups</td>\\n    </tr>\\n    <tr>\\n      <td>Blues</td>\\n      <td>STL</td>\\n      <td>1</td>\\n    </tr>\\n    <tr>\\n      <td>Flyers</td>\\n      <td>PHI</td>\\n      <td>2</td>\\n    </tr>\\n    <tr>\\n      <td>Maple Leafs</td>\\n      <td>TOR</td>\\n      <td>13</td>\\n    </tr>\\n  </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e312268a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {
@ -117,7 +244,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.8.13"
  }
 },
 "nbformat": 4,
--- a/langchain/document_loaders/googledrive.py
+++ b/langchain/document_loaders/googledrive.py
@ -33,6 +33,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
    recursive: bool = False
    file_types: Optional[Sequence[str]] = None
    load_trashed_files: bool = False
+    # NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
+    # results in pydantic validation errors
+    file_loader_cls: Any = None
+    file_loader_kwargs: Dict["str", Any] = {}

    @root_validator
    def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
@ -231,7 +235,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
                returns.append(self._load_document_from_id(file["id"]))  # type: ignore
            elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
                returns.extend(self._load_sheet_from_id(file["id"]))  # type: ignore
-            elif file["mimeType"] == "application/pdf":
+            elif (
+                file["mimeType"] == "application/pdf"
+                or self.file_loader_cls is not None
+            ):
                returns.extend(self._load_file_from_id(file["id"]))  # type: ignore
            else:
                pass
@ -287,23 +294,32 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
        done = False
        while done is False:
            status, done = downloader.next_chunk()
-        content = fh.getvalue()

-        from PyPDF2 import PdfReader
+        if self.file_loader_cls is not None:
+            fh.seek(0)
+            loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs)
+            docs = loader.load()
+            for doc in docs:
+                doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
+            return docs

-        pdf_reader = PdfReader(BytesIO(content))
-
-        return [
-            Document(
-                page_content=page.extract_text(),
-                metadata={
-                    "source": f"https://drive.google.com/file/d/{id}/view",
-                    "title": f"{file.get('name')}",
-                    "page": i,
-                },
-            )
-            for i, page in enumerate(pdf_reader.pages)
-        ]
+        else:
+            from PyPDF2 import PdfReader
+
+            content = fh.getvalue()
+            pdf_reader = PdfReader(BytesIO(content))
+
+            return [
+                Document(
+                    page_content=page.extract_text(),
+                    metadata={
+                        "source": f"https://drive.google.com/file/d/{id}/view",
+                        "title": f"{file.get('name')}",
+                        "page": i,
+                    },
+                )
+                for i, page in enumerate(pdf_reader.pages)
+            ]

    def _load_file_from_ids(self) -> List[Document]:
        """Load files from a list of IDs."""