mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Harrison/gdrive enhancements (#6375)
Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
parent
ebfffaa38f
commit
a8cb9ee013
@ -1,7 +1,6 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"attachments": {},
|
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
|
"id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -78,7 +77,6 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"attachments": {},
|
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "2721ba8a",
|
"id": "2721ba8a",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -99,6 +97,135 @@
|
|||||||
" recursive=False\n",
|
" recursive=False\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "d6b80931",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Passing in Optional File Loaders\n",
|
||||||
|
"\n",
|
||||||
|
"When processing files other than Google Docs and Google Sheets, it can be helpful to pass an optional file loader to `GoogleDriveLoader`. If you pass in a file loader, that file loader will be used on documents that do not have a Google Docs or Google Sheets MIME type. Here is an example of how to load an Excel document from Google Drive using a file loader. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "94207e39",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import GoogleDriveLoader\n",
|
||||||
|
"from langchain.document_loaders import UnstructuredFileIOLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "a15fbee0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"file_id=\"1x9WBtFPWMEAdjcJzPScRsjpjQvpSo_kz\"\n",
|
||||||
|
"loader = GoogleDriveLoader(\n",
|
||||||
|
" file_ids=[file_id],\n",
|
||||||
|
" file_loader_cls=UnstructuredFileIOLoader,\n",
|
||||||
|
" file_loader_kwargs={\"mode\": \"elements\"}\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "98410bda",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "e3e72221",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n <tbody>\\n <tr>\\n <td>Team</td>\\n <td>Location</td>\\n <td>Stanley Cups</td>\\n </tr>\\n <tr>\\n <td>Blues</td>\\n <td>STL</td>\\n <td>1</td>\\n </tr>\\n <tr>\\n <td>Flyers</td>\\n <td>PHI</td>\\n <td>2</td>\\n </tr>\\n <tr>\\n <td>Maple Leafs</td>\\n <td>TOR</td>\\n <td>13</td>\\n </tr>\\n </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"docs[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "238cd06f",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"You can also process a folder with a mix of files and Google Docs/Sheets using the following pattern:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "0e2d093f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"folder_id=\"1asMOHY1BqBS84JcRbOag5LOJac74gpmD\"\n",
|
||||||
|
"loader = GoogleDriveLoader(\n",
|
||||||
|
" folder_id=folder_id,\n",
|
||||||
|
" file_loader_cls=UnstructuredFileIOLoader,\n",
|
||||||
|
" file_loader_kwargs={\"mode\": \"elements\"}\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "b35ddcc6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = loader.load()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "3cc141e0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Document(page_content='\\n \\n \\n Team\\n Location\\n Stanley Cups\\n \\n \\n Blues\\n STL\\n 1\\n \\n \\n Flyers\\n PHI\\n 2\\n \\n \\n Maple Leafs\\n TOR\\n 13\\n \\n \\n', metadata={'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'page_number': 1, 'page_name': 'Stanley Cups', 'text_as_html': '<table border=\"1\" class=\"dataframe\">\\n <tbody>\\n <tr>\\n <td>Team</td>\\n <td>Location</td>\\n <td>Stanley Cups</td>\\n </tr>\\n <tr>\\n <td>Blues</td>\\n <td>STL</td>\\n <td>1</td>\\n </tr>\\n <tr>\\n <td>Flyers</td>\\n <td>PHI</td>\\n <td>2</td>\\n </tr>\\n <tr>\\n <td>Maple Leafs</td>\\n <td>TOR</td>\\n <td>13</td>\\n </tr>\\n </tbody>\\n</table>', 'category': 'Table', 'source': 'https://drive.google.com/file/d/1aA6L2AR3g0CR-PW03HEZZo4NaVlKpaP7/view'})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"docs[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e312268a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
@ -117,7 +244,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.6"
|
"version": "3.8.13"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -33,6 +33,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
|||||||
recursive: bool = False
|
recursive: bool = False
|
||||||
file_types: Optional[Sequence[str]] = None
|
file_types: Optional[Sequence[str]] = None
|
||||||
load_trashed_files: bool = False
|
load_trashed_files: bool = False
|
||||||
|
# NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
|
||||||
|
# results in pydantic validation errors
|
||||||
|
file_loader_cls: Any = None
|
||||||
|
file_loader_kwargs: Dict["str", Any] = {}
|
||||||
|
|
||||||
@root_validator
|
@root_validator
|
||||||
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
|
def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
|
||||||
@ -231,7 +235,10 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
|||||||
returns.append(self._load_document_from_id(file["id"])) # type: ignore
|
returns.append(self._load_document_from_id(file["id"])) # type: ignore
|
||||||
elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
|
elif file["mimeType"] == "application/vnd.google-apps.spreadsheet":
|
||||||
returns.extend(self._load_sheet_from_id(file["id"])) # type: ignore
|
returns.extend(self._load_sheet_from_id(file["id"])) # type: ignore
|
||||||
elif file["mimeType"] == "application/pdf":
|
elif (
|
||||||
|
file["mimeType"] == "application/pdf"
|
||||||
|
or self.file_loader_cls is not None
|
||||||
|
):
|
||||||
returns.extend(self._load_file_from_id(file["id"])) # type: ignore
|
returns.extend(self._load_file_from_id(file["id"])) # type: ignore
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
@ -287,23 +294,32 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
|
|||||||
done = False
|
done = False
|
||||||
while done is False:
|
while done is False:
|
||||||
status, done = downloader.next_chunk()
|
status, done = downloader.next_chunk()
|
||||||
content = fh.getvalue()
|
|
||||||
|
|
||||||
from PyPDF2 import PdfReader
|
if self.file_loader_cls is not None:
|
||||||
|
fh.seek(0)
|
||||||
|
loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs)
|
||||||
|
docs = loader.load()
|
||||||
|
for doc in docs:
|
||||||
|
doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view"
|
||||||
|
return docs
|
||||||
|
|
||||||
pdf_reader = PdfReader(BytesIO(content))
|
else:
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
|
||||||
return [
|
content = fh.getvalue()
|
||||||
Document(
|
pdf_reader = PdfReader(BytesIO(content))
|
||||||
page_content=page.extract_text(),
|
|
||||||
metadata={
|
return [
|
||||||
"source": f"https://drive.google.com/file/d/{id}/view",
|
Document(
|
||||||
"title": f"{file.get('name')}",
|
page_content=page.extract_text(),
|
||||||
"page": i,
|
metadata={
|
||||||
},
|
"source": f"https://drive.google.com/file/d/{id}/view",
|
||||||
)
|
"title": f"{file.get('name')}",
|
||||||
for i, page in enumerate(pdf_reader.pages)
|
"page": i,
|
||||||
]
|
},
|
||||||
|
)
|
||||||
|
for i, page in enumerate(pdf_reader.pages)
|
||||||
|
]
|
||||||
|
|
||||||
def _load_file_from_ids(self) -> List[Document]:
|
def _load_file_from_ids(self) -> List[Document]:
|
||||||
"""Load files from a list of IDs."""
|
"""Load files from a list of IDs."""
|
||||||
|
Loading…
Reference in New Issue
Block a user