{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# BigQuery Loader\n", "\n", "Load a BigQuery query with one document per row." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from langchain.document_loaders import BigQueryLoader" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "BASE_QUERY = '''\n", "SELECT\n", " id,\n", " dna_sequence,\n", " organism\n", "FROM (\n", " SELECT\n", " ARRAY (\n", " SELECT\n", " AS STRUCT 1 AS id, \"ATTCGA\" AS dna_sequence, \"Lokiarchaeum sp. (strain GC14_75).\" AS organism\n", " UNION ALL\n", " SELECT\n", " AS STRUCT 2 AS id, \"AGGCGA\" AS dna_sequence, \"Heimdallarchaeota archaeon (strain LC_2).\" AS organism\n", " UNION ALL\n", " SELECT\n", " AS STRUCT 3 AS id, \"TCCGGA\" AS dna_sequence, \"Acidianus hospitalis (strain W1).\" AS organism) AS new_array),\n", " UNNEST(new_array)\n", "'''" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Basic Usage" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "loader = BigQueryLoader(BASE_QUERY)\n", "\n", "data = loader.load()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Document(page_content='id: 1\\ndna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).', lookup_str='', metadata={}, lookup_index=0), Document(page_content='id: 2\\ndna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).', lookup_str='', metadata={}, lookup_index=0), Document(page_content='id: 3\\ndna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).', lookup_str='', metadata={}, lookup_index=0)]\n" ] } ], "source": [ "print(data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Specifying Which Columns are Content vs Metadata" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "loader = BigQueryLoader(BASE_QUERY, page_content_columns=[\"dna_sequence\", \"organism\"], metadata_columns=[\"id\"])\n", "\n", "data = loader.load()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Document(page_content='dna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).', lookup_str='', metadata={'id': 1}, lookup_index=0), Document(page_content='dna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).', lookup_str='', metadata={'id': 2}, lookup_index=0), Document(page_content='dna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).', lookup_str='', metadata={'id': 3}, lookup_index=0)]\n" ] } ], "source": [ "print(data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Adding Source to Metadata" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# Note that the `id` column is being returned twice, with one instance aliased as `source`\n", "ALIASED_QUERY = '''\n", "SELECT\n", " id,\n", " dna_sequence,\n", " organism,\n", " id as source\n", "FROM (\n", " SELECT\n", " ARRAY (\n", " SELECT\n", " AS STRUCT 1 AS id, \"ATTCGA\" AS dna_sequence, \"Lokiarchaeum sp. (strain GC14_75).\" AS organism\n", " UNION ALL\n", " SELECT\n", " AS STRUCT 2 AS id, \"AGGCGA\" AS dna_sequence, \"Heimdallarchaeota archaeon (strain LC_2).\" AS organism\n", " UNION ALL\n", " SELECT\n", " AS STRUCT 3 AS id, \"TCCGGA\" AS dna_sequence, \"Acidianus hospitalis (strain W1).\" AS organism) AS new_array),\n", " UNNEST(new_array)\n", "'''" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "loader = BigQueryLoader(ALIASED_QUERY, metadata_columns=[\"source\"])\n", "\n", "data = loader.load()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[Document(page_content='id: 1\\ndna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).\\nsource: 1', lookup_str='', metadata={'source': 1}, lookup_index=0), Document(page_content='id: 2\\ndna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).\\nsource: 2', lookup_str='', metadata={'source': 2}, lookup_index=0), Document(page_content='id: 3\\ndna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).\\nsource: 3', lookup_str='', metadata={'source': 3}, lookup_index=0)]\n" ] } ], "source": [ "print(data)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.1" } }, "nbformat": 4, "nbformat_minor": 2 }