Harrison/gdrive enhancements (#6375)

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
2024-11-06 03:20:49 +00:00 · 2023-06-18 11:07:23 -07:00 · 2023-06-18 11:07:23 -07:00 · a8cb9ee013
commit a8cb9ee013
parent ebfffaa38f
2 changed files with 161 additions and 18 deletions
--- a/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb
@ -1,7 +1,6 @@
 {
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
   "metadata": {},
@ -78,7 +77,6 @@
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "2721ba8a",
   "metadata": {},
@ -99,6 +97,135 @@
    "    recursive=False\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6b80931",
   "metadata": {},
   "source": [
    "## Passing in Optional File Loaders\n",
    "\n",
    "When processing files other than Google Docs and Google Sheets, it can be helpful to pass an optional file loader to `GoogleDriveLoader`. If you pass in a file loader, it will be used for every file that does not have a Google Docs or Google Sheets MIME type. The loader class is instantiated with the downloaded content as a file-like object via the `file` keyword argument, plus any `file_loader_kwargs` you provide. Here is an example of how to load an Excel document from Google Drive using a file loader."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "94207e39",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import GoogleDriveLoader\n",
    "from langchain.document_loaders import UnstructuredFileIOLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a15fbee0",
   "metadata": {},
   "outputs": [],
   "source": [
    "file_id=\"1x9WBtFPWMEAdjcJzPScRsjpjQvpSo_kz\"\n",
    "loader = GoogleDriveLoader(\n",
    "    file_ids=[file_id],\n",
    "    file_loader_cls=UnstructuredFileIOLoader,\n",
    "    file_loader_kwargs={\"mode\": \"elements\"}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "98410bda",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e3e72221",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Document(page_content='\\n  \\n    \\n      Team\\n      Location\\n      Stanley Cups\\n    \\n    \\n      Blues\\n      STL\\n      1\\n    \\n    \\n      Flyers\\n      PHI\\n      2\\n    \\n    \\n      Maple Leafs\\n      TOR\\n      13\\n    \\n  \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n  <tbody>\\n    <tr>\\n      <td>Team</td>\\n      <td>Location</td>\\n      <td>Stanley Cups</td>\\n    </tr>\\n    <tr>\\n      <td>Blues</td>\\n      <td>STL</td>\\n      <td>1</td>\\n    </tr>\\n    <tr>\\n      <td>Flyers</td>\\n      <td>PHI</td>\\n      <td>2</td>\\n    </tr>\\n    <tr>\\n      <td>Maple Leafs</td>\\n      <td>TOR</td>\\n      <td>13</td>\\n    </tr>\\n  </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "238cd06f",
   "metadata": {},
   "source": [
    "You can also process a folder that contains a mix of regular files and Google Docs/Sheets using the following pattern:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0e2d093f",
   "metadata": {},
   "outputs": [],
   "source": [
    "folder_id=\"1asMOHY1BqBS84JcRbOag5LOJac74gpmD\"\n",
    "loader = GoogleDriveLoader(\n",
    "    folder_id=folder_id,\n",
    "    file_loader_cls=UnstructuredFileIOLoader,\n",
    "    file_loader_kwargs={\"mode\": \"elements\"}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b35ddcc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3cc141e0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Document(page_content='\\n  \\n    \\n      Team\\n      Location\\n      Stanley Cups\\n    \\n    \\n      Blues\\n      STL\\n      1\\n    \\n    \\n      Flyers\\n      PHI\\n      2\\n    \\n    \\n      Maple Leafs\\n      TOR\\n      13\\n    \\n  \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n  <tbody>\\n    <tr>\\n      <td>Team</td>\\n      <td>Location</td>\\n      <td>Stanley Cups</td>\\n    </tr>\\n    <tr>\\n      <td>Blues</td>\\n      <td>STL</td>\\n      <td>1</td>\\n    </tr>\\n    <tr>\\n      <td>Flyers</td>\\n      <td>PHI</td>\\n      <td>2</td>\\n    </tr>\\n    <tr>\\n      <td>Maple Leafs</td>\\n      <td>TOR</td>\\n      <td>13</td>\\n    </tr>\\n  </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e312268a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
@ -117,7 +244,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.8.13"
  }
 },
 "nbformat": 4,
--- a/langchain/document_loaders/googledrive.py
+++ b/langchain/document_loaders/googledrive.py
@ -33,6 +33,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
    recursive: bool = False
    file_types: Optional[Sequence[str]] = None
    load_trashed_files: bool = False
    # NOTE(MthwRobinson) - file_loader_cls is annotated as Any because annotating
    # it as `type` here currently results in pydantic validation errors
    file_loader_cls: Any = None
    file_loader_kwargs: Dict["str", Any] = {}
    @root_validator
    def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
@ -231,7 +235,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
                returns.append(self._load_document_from_id(file["id"]))  # type: ignore
            elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
                returns.extend(self._load_sheet_from_id(file["id"]))  # type: ignore
-            elif file["mimeType"] == "application/pdf":
+            elif (
                file["mimeType"] == "application/pdf"
                or self.file_loader_cls is not None
            ):
                returns.extend(self._load_file_from_id(file["id"]))  # type: ignore
            else:
                pass
@ -287,23 +294,32 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
        done = False
        while done is False:
            status, done = downloader.next_chunk()
        content = fh.getvalue()
-        from PyPDF2 import PdfReader
+        if self.file_loader_cls is not None:
            fh.seek(0)
            loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs)
            docs = loader.load()
            for doc in docs:
                doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
            return docs
-        pdf_reader = PdfReader(BytesIO(content))
+        else:
            from PyPDF2 import PdfReader
-        return [
+            content = fh.getvalue()
-            Document(
+            pdf_reader = PdfReader(BytesIO(content))
-                page_content=page.extract_text(),
+
-                metadata={
+            return [
-                    "source": f"https://drive.google.com/file/d/{id}/view",
+                Document(
-                    "title": f"{file.get('name')}",
+                    page_content=page.extract_text(),
-                    "page": i,
+                    metadata={
-                },
+                        "source": f"https://drive.google.com/file/d/{id}/view",
-            )
+                        "title": f"{file.get('name')}",
-            for i, page in enumerate(pdf_reader.pages)
+                        "page": i,
-        ]
+                    },
                )
                for i, page in enumerate(pdf_reader.pages)
            ]
    def _load_file_from_ids(self) -> List[Document]:
        """Load files from a list of IDs."""