forked from Archives/langchain
Harrison/gdrive enhancements (#6375)
Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
parent
ebfffaa38f
commit
a8cb9ee013
@ -1,7 +1,6 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
|
||||
"metadata": {},
|
||||
@ -78,7 +77,6 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "2721ba8a",
|
||||
"metadata": {},
|
||||
@ -99,6 +97,135 @@
|
||||
" recursive=False\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d6b80931",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Passing in Optional File Loaders\n",
|
||||
"\n",
|
||||
"When processing files other than Google Docs and Google Sheets, it can be helpful to pass an optional file loader to `GoogleDriveLoader`. If you pass in a file loader, that file loader will be used on documents that do not have a Google Docs or Google Sheets MIME type. Here is an example of how to load an Excel document from Google Drive using a file loader. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "94207e39",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import GoogleDriveLoader\n",
|
||||
"from langchain.document_loaders import UnstructuredFileIOLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "a15fbee0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"file_id=\"1x9WBtFPWMEAdjcJzPScRsjpjQvpSo_kz\"\n",
|
||||
"loader = GoogleDriveLoader(\n",
|
||||
" file_ids=[file_id],\n",
|
||||
" file_loader_cls=UnstructuredFileIOLoader,\n",
|
||||
" file_loader_kwargs={\"mode\": \"elements\"}\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "98410bda",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "e3e72221",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n <tbody>\\n <tr>\\n <td>Team</td>\\n <td>Location</td>\\n <td>Stanley Cups</td>\\n </tr>\\n <tr>\\n <td>Blues</td>\\n <td>STL</td>\\n <td>1</td>\\n </tr>\\n <tr>\\n <td>Flyers</td>\\n <td>PHI</td>\\n <td>2</td>\\n </tr>\\n <tr>\\n <td>Maple Leafs</td>\\n <td>TOR</td>\\n <td>13</td>\\n </tr>\\n </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "238cd06f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"You can also process a folder with a mix of files and Google Docs/Sheets using the following pattern:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "0e2d093f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"folder_id=\"1asMOHY1BqBS84JcRbOag5LOJac74gpmD\"\n",
|
||||
"loader = GoogleDriveLoader(\n",
|
||||
" folder_id=folder_id,\n",
|
||||
" file_loader_cls=UnstructuredFileIOLoader,\n",
|
||||
" file_loader_kwargs={\"mode\": \"elements\"}\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "b35ddcc6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "3cc141e0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n <tbody>\\n <tr>\\n <td>Team</td>\\n <td>Location</td>\\n <td>Stanley Cups</td>\\n </tr>\\n <tr>\\n <td>Blues</td>\\n <td>STL</td>\\n <td>1</td>\\n </tr>\\n <tr>\\n <td>Flyers</td>\\n <td>PHI</td>\\n <td>2</td>\\n </tr>\\n <tr>\\n <td>Maple Leafs</td>\\n <td>TOR</td>\\n <td>13</td>\\n </tr>\\n </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e312268a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@ -117,7 +244,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
"version": "3.8.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -33,6 +33,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
recursive: bool = False
|
||||
file_types: Optional[Sequence[str]] = None
|
||||
load_trashed_files: bool = False
|
||||
# NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
|
||||
# results in pydantic validation errors
|
||||
file_loader_cls: Any = None
|
||||
file_loader_kwargs: Dict["str", Any] = {}
|
||||
|
||||
@root_validator
|
||||
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
|
||||
@ -231,7 +235,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
returns.append(self._load_document_from_id(file["id"])) # type: ignore
|
||||
elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
|
||||
returns.extend(self._load_sheet_from_id(file["id"])) # type: ignore
|
||||
elif file["mimeType"] == "application/pdf":
|
||||
elif (
|
||||
file["mimeType"] == "application/pdf"
|
||||
or self.file_loader_cls is not None
|
||||
):
|
||||
returns.extend(self._load_file_from_id(file["id"])) # type: ignore
|
||||
else:
|
||||
pass
|
||||
@ -287,23 +294,32 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||
done = False
|
||||
while done is False:
|
||||
status, done = downloader.next_chunk()
|
||||
content = fh.getvalue()
|
||||
|
||||
from PyPDF2 import PdfReader
|
||||
if self.file_loader_cls is not None:
|
||||
fh.seek(0)
|
||||
loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs)
|
||||
docs = loader.load()
|
||||
for doc in docs:
|
||||
doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
|
||||
return docs
|
||||
|
||||
pdf_reader = PdfReader(BytesIO(content))
|
||||
else:
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
return [
|
||||
Document(
|
||||
page_content=page.extract_text(),
|
||||
metadata={
|
||||
"source": f"https://drive.google.com/file/d/{id}/view",
|
||||
"title": f"{file.get('name')}",
|
||||
"page": i,
|
||||
},
|
||||
)
|
||||
for i, page in enumerate(pdf_reader.pages)
|
||||
]
|
||||
content = fh.getvalue()
|
||||
pdf_reader = PdfReader(BytesIO(content))
|
||||
|
||||
return [
|
||||
Document(
|
||||
page_content=page.extract_text(),
|
||||
metadata={
|
||||
"source": f"https://drive.google.com/file/d/{id}/view",
|
||||
"title": f"{file.get('name')}",
|
||||
"page": i,
|
||||
},
|
||||
)
|
||||
for i, page in enumerate(pdf_reader.pages)
|
||||
]
|
||||
|
||||
def _load_file_from_ids(self) -> List[Document]:
|
||||
"""Load files from a list of IDs."""
|
||||
|
Loading…
Reference in New Issue
Block a user