diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb
index e53461306e..fe59db46ce 100644
--- a/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/google_drive.ipynb
@@ -1,7 +1,6 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
"metadata": {},
@@ -78,7 +77,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "2721ba8a",
"metadata": {},
@@ -99,6 +97,135 @@
" recursive=False\n",
")"
]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d6b80931",
+ "metadata": {},
+ "source": [
+ "## Passing in Optional File Loaders\n",
+ "\n",
+ "When processing files other than Google Docs and Google Sheets, it can be helpful to pass an optional file loader to `GoogleDriveLoader`. If you pass in a file loader, that file loader will be used on documents that do not have a Google Docs or Google Sheets MIME type. Here is an example of how to load an Excel document from Google Drive using a file loader."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "94207e39",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders import GoogleDriveLoader\n",
+ "from langchain.document_loaders import UnstructuredFileIOLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "a15fbee0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "file_id=\"1x9WBtFPWMEAdjcJzPScRsjpjQvpSo_kz\"\n",
+ "loader = GoogleDriveLoader(\n",
+ " file_ids=[file_id],\n",
+ " file_loader_cls=UnstructuredFileIOLoader,\n",
+ " file_loader_kwargs={\"mode\": \"elements\"}\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "98410bda",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "docs = loader.load()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "e3e72221",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table>\\n <tbody>\\n <tr>\\n <td>Team</td>\\n <td>Location</td>\\n <td>Stanley Cups</td>\\n </tr>\\n <tr>\\n <td>Blues</td>\\n <td>STL</td>\\n <td>1</td>\\n </tr>\\n <tr>\\n <td>Flyers</td>\\n <td>PHI</td>\\n <td>2</td>\\n </tr>\\n <tr>\\n <td>Maple Leafs</td>\\n <td>TOR</td>\\n <td>13</td>\\n </tr>\\n </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "docs[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "238cd06f",
+ "metadata": {},
+ "source": [
+ "You can also process a folder with a mix of files and Google Docs/Sheets using the following pattern:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "0e2d093f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "folder_id=\"1asMOHY1BqBS84JcRbOag5LOJac74gpmD\"\n",
+ "loader = GoogleDriveLoader(\n",
+ " folder_id=folder_id,\n",
+ " file_loader_cls=UnstructuredFileIOLoader,\n",
+ " file_loader_kwargs={\"mode\": \"elements\"}\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "b35ddcc6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "docs = loader.load()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "3cc141e0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table>\\n <tbody>\\n <tr>\\n <td>Team</td>\\n <td>Location</td>\\n <td>Stanley Cups</td>\\n </tr>\\n <tr>\\n <td>Blues</td>\\n <td>STL</td>\\n <td>1</td>\\n </tr>\\n <tr>\\n <td>Flyers</td>\\n <td>PHI</td>\\n <td>2</td>\\n </tr>\\n <tr>\\n <td>Maple Leafs</td>\\n <td>TOR</td>\\n <td>13</td>\\n </tr>\\n </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "docs[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e312268a",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
@@ -117,7 +244,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.8.13"
}
},
"nbformat": 4,
diff --git a/langchain/document_loaders/googledrive.py b/langchain/document_loaders/googledrive.py
index c28bf705bc..5e21d401ed 100644
--- a/langchain/document_loaders/googledrive.py
+++ b/langchain/document_loaders/googledrive.py
@@ -33,6 +33,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
recursive: bool = False
file_types: Optional[Sequence[str]] = None
load_trashed_files: bool = False
+ # NOTE(MthwRobinson) - file_loader_cls is annotated as Any because annotating
+ # it as a type currently results in pydantic validation errors
+ file_loader_cls: Any = None
+ file_loader_kwargs: Dict[str, Any] = {}  # kwargs forwarded to file_loader_cls
@root_validator
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
@@ -231,7 +235,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
returns.append(self._load_document_from_id(file["id"])) # type: ignore
elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
returns.extend(self._load_sheet_from_id(file["id"])) # type: ignore
- elif file["mimeType"] == "application/pdf":
+ elif (
+ file["mimeType"] == "application/pdf"
+ or self.file_loader_cls is not None
+ ):
returns.extend(self._load_file_from_id(file["id"])) # type: ignore
else:
pass
@@ -287,23 +294,32 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
done = False
while done is False:
status, done = downloader.next_chunk()
- content = fh.getvalue()
- from PyPDF2 import PdfReader
+ if self.file_loader_cls is not None:
+ fh.seek(0)
+ loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs)
+ docs = loader.load()
+ for doc in docs:
+ doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
+ return docs
- pdf_reader = PdfReader(BytesIO(content))
+ else:
+ from PyPDF2 import PdfReader
- return [
- Document(
- page_content=page.extract_text(),
- metadata={
- "source": f"https://drive.google.com/file/d/{id}/view",
- "title": f"{file.get('name')}",
- "page": i,
- },
- )
- for i, page in enumerate(pdf_reader.pages)
- ]
+ content = fh.getvalue()
+ pdf_reader = PdfReader(BytesIO(content))
+
+ return [
+ Document(
+ page_content=page.extract_text(),
+ metadata={
+ "source": f"https://drive.google.com/file/d/{id}/view",
+ "title": f"{file.get('name')}",
+ "page": i,
+ },
+ )
+ for i, page in enumerate(pdf_reader.pages)
+ ]
def _load_file_from_ids(self) -> List[Document]:
"""Load files from a list of IDs."""