Harrison/gdrive enhancements (#6375)

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
Harrison Chase 2023-06-18 11:07:23 -07:00 committed by GitHub
parent ebfffaa38f
commit a8cb9ee013
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 161 additions and 18 deletions

View File

@ -1,7 +1,6 @@
{ {
"cells": [ "cells": [
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "b0ed136e-6983-4893-ae1b-b75753af05f8", "id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
"metadata": {}, "metadata": {},
@ -78,7 +77,6 @@
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "2721ba8a", "id": "2721ba8a",
"metadata": {}, "metadata": {},
@ -99,6 +97,135 @@
" recursive=False\n", " recursive=False\n",
")" ")"
] ]
},
{
"cell_type": "markdown",
"id": "d6b80931",
"metadata": {},
"source": [
"## Passing in Optional File Loaders\n",
"\n",
"When processing files other than Google Docs and Google Sheets, it can be helpful to pass an optional file loader class to `GoogleDriveLoader` via `file_loader_cls`, along with any extra constructor arguments in `file_loader_kwargs`. If you pass in a file loader, it will be used on files that do not have a Google Docs or Google Sheets MIME type. The loader class must accept the downloaded file as a file-like object through a `file` keyword argument, as `UnstructuredFileIOLoader` does. Here is an example of how to load an Excel document from Google Drive using a file loader."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "94207e39",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import GoogleDriveLoader\n",
"from langchain.document_loaders import UnstructuredFileIOLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a15fbee0",
"metadata": {},
"outputs": [],
"source": [
"file_id=\"1x9WBtFPWMEAdjcJzPScRsjpjQvpSo_kz\"\n",
"loader = GoogleDriveLoader(\n",
" file_ids=[file_id],\n",
" file_loader_cls=UnstructuredFileIOLoader,\n",
" file_loader_kwargs={\"mode\": \"elements\"}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "98410bda",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e3e72221",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n <tbody>\\n <tr>\\n <td>Team</td>\\n <td>Location</td>\\n <td>Stanley Cups</td>\\n </tr>\\n <tr>\\n <td>Blues</td>\\n <td>STL</td>\\n <td>1</td>\\n </tr>\\n <tr>\\n <td>Flyers</td>\\n <td>PHI</td>\\n <td>2</td>\\n </tr>\\n <tr>\\n <td>Maple Leafs</td>\\n <td>TOR</td>\\n <td>13</td>\\n </tr>\\n </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0]"
]
},
{
"cell_type": "markdown",
"id": "238cd06f",
"metadata": {},
"source": [
"You can also process a folder with a mix of files and Google Docs/Sheets using the following pattern:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0e2d093f",
"metadata": {},
"outputs": [],
"source": [
"folder_id=\"1asMOHY1BqBS84JcRbOag5LOJac74gpmD\"\n",
"loader = GoogleDriveLoader(\n",
" folder_id=folder_id,\n",
" file_loader_cls=UnstructuredFileIOLoader,\n",
" file_loader_kwargs={\"mode\": \"elements\"}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b35ddcc6",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3cc141e0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n <tbody>\\n <tr>\\n <td>Team</td>\\n <td>Location</td>\\n <td>Stanley Cups</td>\\n </tr>\\n <tr>\\n <td>Blues</td>\\n <td>STL</td>\\n <td>1</td>\\n </tr>\\n <tr>\\n <td>Flyers</td>\\n <td>PHI</td>\\n <td>2</td>\\n </tr>\\n <tr>\\n <td>Maple Leafs</td>\\n <td>TOR</td>\\n <td>13</td>\\n </tr>\\n </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e312268a",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@ -117,7 +244,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.6" "version": "3.8.13"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@ -33,6 +33,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
recursive: bool = False recursive: bool = False
file_types: Optional[Sequence[str]] = None file_types: Optional[Sequence[str]] = None
load_trashed_files: bool = False load_trashed_files: bool = False
# NOTE(MthwRobinson) - file_loader_cls is annotated as Any because changing
# the annotation to `type` here currently results in pydantic validation errors
file_loader_cls: Any = None
file_loader_kwargs: Dict["str", Any] = {}
@root_validator @root_validator
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]: def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
@ -231,7 +235,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
returns.append(self._load_document_from_id(file["id"])) # type: ignore returns.append(self._load_document_from_id(file["id"])) # type: ignore
elif file["mimeType"] == "application/vnd.google-apps.spreadsheet": elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
returns.extend(self._load_sheet_from_id(file["id"])) # type: ignore returns.extend(self._load_sheet_from_id(file["id"])) # type: ignore
elif file["mimeType"] == "application/pdf": elif (
file["mimeType"] == "application/pdf"
or self.file_loader_cls is not None
):
returns.extend(self._load_file_from_id(file["id"])) # type: ignore returns.extend(self._load_file_from_id(file["id"])) # type: ignore
else: else:
pass pass
@ -287,23 +294,32 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
done = False done = False
while done is False: while done is False:
status, done = downloader.next_chunk() status, done = downloader.next_chunk()
content = fh.getvalue()
from PyPDF2 import PdfReader if self.file_loader_cls is not None:
fh.seek(0)
loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs)
docs = loader.load()
for doc in docs:
doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
return docs
pdf_reader = PdfReader(BytesIO(content)) else:
from PyPDF2 import PdfReader
return [ content = fh.getvalue()
Document( pdf_reader = PdfReader(BytesIO(content))
page_content=page.extract_text(),
metadata={ return [
"source": f"https://drive.google.com/file/d/{id}/view", Document(
"title": f"{file.get('name')}", page_content=page.extract_text(),
"page": i, metadata={
}, "source": f"https://drive.google.com/file/d/{id}/view",
) "title": f"{file.get('name')}",
for i, page in enumerate(pdf_reader.pages) "page": i,
] },
)
for i, page in enumerate(pdf_reader.pages)
]
def _load_file_from_ids(self) -> List[Document]: def _load_file_from_ids(self) -> List[Document]:
"""Load files from a list of IDs.""" """Load files from a list of IDs."""