From 21bd16bb59769eed908cbcb25d9f3e1f0d68de65 Mon Sep 17 00:00:00 2001 From: Lance Martin <122662504+rlancemartin@users.noreply.github.com> Date: Sat, 10 Jun 2023 15:43:18 -0700 Subject: [PATCH] Create Airtable loader (#5958) Create document loader for Airtable --- docs/modules/indexes/document_loaders.rst | 1 + .../document_loaders/examples/airtable.ipynb | 142 ++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/airtable.py | 36 +++++ 4 files changed, 181 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/airtable.ipynb create mode 100644 langchain/document_loaders/airtable.py diff --git a/docs/modules/indexes/document_loaders.rst b/docs/modules/indexes/document_loaders.rst index 190abbef..df46be78 100644 --- a/docs/modules/indexes/document_loaders.rst +++ b/docs/modules/indexes/document_loaders.rst @@ -30,6 +30,7 @@ For detailed instructions on how to get set up with Unstructured, see installati :maxdepth: 1 :glob: + ./document_loaders/examples/airtable.ipynb ./document_loaders/examples/audio.ipynb ./document_loaders/examples/conll-u.ipynb ./document_loaders/examples/copypaste.ipynb diff --git a/docs/modules/indexes/document_loaders/examples/airtable.ipynb b/docs/modules/indexes/document_loaders/examples/airtable.ipynb new file mode 100644 index 00000000..decabe8e --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/airtable.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7ae421e6", + "metadata": {}, + "source": [ + "# Airtable" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98aea00d", + "metadata": {}, + "outputs": [], + "source": [ + "! 
pip install pyairtable" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "592483eb", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import AirtableLoader" + ] + }, + { + "cell_type": "markdown", + "id": "637e1205", + "metadata": {}, + "source": [ + "* Get your API key [here](https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens).\n", + "* Get the ID of your base [here](https://airtable.com/developers/web/api/introduction).\n", + "* Get your table ID from the table URL as shown [here](https://www.highviewapps.com/kb/where-can-i-find-the-airtable-base-id-and-table-id/#:~:text=Both%20the%20Airtable%20Base%20ID,URL%20that%20begins%20with%20tbl)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c12a7aff", + "metadata": {}, + "outputs": [], + "source": [ + "api_key=\"xxx\"\n", + "base_id=\"xxx\"\n", + "table_id=\"xxx\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ccddd5a6", + "metadata": {}, + "outputs": [], + "source": [ + "loader = AirtableLoader(api_key,table_id,base_id)\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "ae76c25c", + "metadata": {}, + "source": [ + "Each table row is returned as a `Document` whose `page_content` is the string form of the record `dict`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7abec7ce", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "403c95da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': 'recF3GbGZCuh9sXIQ',\n", + " 'createdTime': '2023-06-09T04:47:21.000Z',\n", + " 'fields': {'Priority': 'High',\n", + " 'Status': 'In progress',\n", + " 'Name': 'Document Splitters'}}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eval(docs[0].page_content)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 393062b7..d533159a 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -1,6 +1,7 @@ """All different types of document loaders.""" from langchain.document_loaders.airbyte_json import AirbyteJSONLoader +from langchain.document_loaders.airtable import AirtableLoader from langchain.document_loaders.apify_dataset import ApifyDatasetLoader from langchain.document_loaders.arxiv import ArxivLoader from langchain.document_loaders.azlyrics import AZLyricsLoader @@ -135,6 +136,7 @@ TelegramChatLoader = TelegramChatFileLoader __all__ = [ "AZLyricsLoader", "AirbyteJSONLoader", + "AirtableLoader", "ApifyDatasetLoader", "ArxivLoader", 
"AzureBlobStorageContainerLoader", diff --git a/langchain/document_loaders/airtable.py b/langchain/document_loaders/airtable.py new file mode 100644 index 00000000..3dfaf40c --- /dev/null +++ b/langchain/document_loaders/airtable.py @@ -0,0 +1,36 @@ +from typing import Iterator, List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class AirtableLoader(BaseLoader): + """Loader that loads the records of an Airtable table as Documents.""" + + def __init__(self, api_token: str, table_id: str, base_id: str): + """Initialize with the API token, then the table ID, then the base ID.""" + self.api_token = api_token + self.table_id = table_id + self.base_id = base_id + + def lazy_load(self) -> Iterator[Document]: + """Yield one Document per table record, with str(record) as content.""" + + from pyairtable import Table + + table = Table(self.api_token, self.base_id, self.table_id) + records = table.all() + for record in records: + # Need to convert record from dict to str + yield Document( + page_content=str(record), + metadata={ + "source": self.base_id + "_" + self.table_id, + "base_id": self.base_id, + "table_id": self.table_id, + }, + ) + + def load(self) -> List[Document]: + """Load all table records eagerly as a list of Documents.""" + return list(self.lazy_load())