From 2e96704d59479e18f6e574b2ac624427bed3f9d0 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Fri, 10 Feb 2023 18:08:00 -0800 Subject: [PATCH] Harrison/airbyte (#989) Co-authored-by: zanderchase Co-authored-by: Harrison Chase --- .../examples/airbyte_json.ipynb | 171 ++++++++++++++++++ .../document_loaders/how_to_guides.rst | 2 + langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/airbyte_json.py | 41 +++++ 4 files changed, 216 insertions(+) create mode 100644 docs/modules/document_loaders/examples/airbyte_json.ipynb create mode 100644 langchain/document_loaders/airbyte_json.py diff --git a/docs/modules/document_loaders/examples/airbyte_json.ipynb b/docs/modules/document_loaders/examples/airbyte_json.ipynb new file mode 100644 index 0000000000..c7a1678d8d --- /dev/null +++ b/docs/modules/document_loaders/examples/airbyte_json.ipynb @@ -0,0 +1,171 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1f3a5ebf", + "metadata": {}, + "source": [ + "# Airbyte JSON\n", + "This covers how to load any source from Airbyte into a local JSON file that can be read in as a document\n", + "\n", + "Prereqs:\n", + "Have docker desktop installed\n", + "\n", + "Steps:\n", + "\n", + "1) clone Airbyte from GitHub - `git clone https://github.com/airbytehq/airbyte.git`\n", + "\n", + "2) switch into Airbyte directory - `cd airbyte`\n", + "\n", + "3) start Airbyte - `docker compose up`\n", + "\n", + "4) In your browser, just visit http://localhost:8000. You will be asked for a username and password. By default, that's username `airbyte` and password `password`.\n", + "\n", + "5) Set up any source you wish\n", + "\n", + "6) Set destination as Local JSON, with specified destination path - let's say `/json_data`. Set up manual sync.\n", + "\n", + "7) Run the connection!\n", + "\n", + "8) To see what files are created, you can navigate to: `file:///tmp/airbyte_local`\n", + "\n", + "9) Find your data and copy its path. That path should be saved in the file variable below. 
It should start with `/tmp/airbyte_local`\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "180c8b74", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import AirbyteJSONLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4af10665", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "_airbyte_raw_pokemon.jsonl\r\n" + ] + } + ], + "source": [ + "!ls /tmp/airbyte_local/json_data/" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "721d9316", + "metadata": {}, + "outputs": [], + "source": [ + "loader = AirbyteJSONLoader('/tmp/airbyte_local/json_data/_airbyte_raw_pokemon.jsonl')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9858b946", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fca024cb", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "abilities: \n", + "ability: \n", + "name: blaze\n", + "url: https://pokeapi.co/api/v2/ability/66/\n", + "\n", + "is_hidden: False\n", + "slot: 1\n", + "\n", + "\n", + "ability: \n", + "name: solar-power\n", + "url: https://pokeapi.co/api/v2/ability/94/\n", + "\n", + "is_hidden: True\n", + "slot: 3\n", + "\n", + "base_experience: 267\n", + "forms: \n", + "name: charizard\n", + "url: https://pokeapi.co/api/v2/pokemon-form/6/\n", + "\n", + "game_indices: \n", + "game_index: 180\n", + "version: \n", + "name: red\n", + "url: https://pokeapi.co/api/v2/version/1/\n", + "\n", + "\n", + "\n", + "game_index: 180\n", + "version: \n", + "name: blue\n", + "url: https://pokeapi.co/api/v2/version/2/\n", + "\n", + "\n", + "\n", + "game_index: 180\n", + "version: \n", + "n\n" + ] + } + ], + "source": [ + "print(data[0].page_content[:500])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"9fa002a5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/document_loaders/how_to_guides.rst b/docs/modules/document_loaders/how_to_guides.rst index d4d0b83df6..2dcd05ebb3 100644 --- a/docs/modules/document_loaders/how_to_guides.rst +++ b/docs/modules/document_loaders/how_to_guides.rst @@ -49,6 +49,8 @@ There are a lot of different document loaders that LangChain supports. Below are `Gutenberg <./examples/gutenberg.html>`_: A walkthrough of how to load data from a Gutenberg ebook text. +`Airbyte Json <./examples/airbyte_json.html>`_: A walkthrough of how to load data from a local Airbyte JSON file. + `Online PDF <./examples/online_pdf.html>`_: A walkthrough of how to load data from an online PDF. .. 
"""Loader that loads local airbyte json files."""
import json
from typing import Any, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


def _stringify_value(val: Any) -> str:
    """Render a decoded JSON value as plain text.

    Strings are returned unchanged, dicts are rendered on a new line via
    ``_stringify_dict``, lists are rendered element by element and joined
    with newlines, and any other value falls back to ``str()``.
    """
    if isinstance(val, str):
        return val
    if isinstance(val, dict):
        # Leading newline so the nested "key: value" lines start below the
        # parent key rather than on the same line.
        return "\n" + _stringify_dict(val)
    if isinstance(val, list):
        return "\n".join(_stringify_value(v) for v in val)
    return str(val)


def _stringify_dict(data: dict) -> str:
    """Render a dict as one ``key: value`` line per entry.

    Every entry, including the last one, is terminated by a newline. An
    empty dict yields an empty string.
    """
    return "".join(
        f"{key}: {_stringify_value(value)}\n" for key, value in data.items()
    )


class AirbyteJSONLoader(BaseLoader):
    """Loader that loads local airbyte json files."""

    def __init__(self, file_path: str):
        """Initialize with file path.

        This should start with '/tmp/airbyte_local/'.
        """
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Load the file into a single document.

        Each non-blank line is parsed as a JSON object and the value under
        its ``"_airbyte_data"`` key is rendered as text. The rendered
        records are concatenated into the page content of one ``Document``
        whose metadata records the file path under ``"source"``.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
            KeyError: if a record has no ``"_airbyte_data"`` key.
        """
        parts = []
        # The context manager closes the handle even if parsing fails;
        # JSON text is read as UTF-8 rather than the locale default.
        with open(self.file_path, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                record = json.loads(line)
                parts.append(_stringify_dict(record["_airbyte_data"]))
        metadata = {"source": self.file_path}
        return [Document(page_content="".join(parts), metadata=metadata)]