From 410bf37fb8ab4e34a1d87a331b56669b0a78773a Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Tue, 28 Mar 2023 08:17:22 -0700 Subject: [PATCH] Harrison/big query (#2100) Co-authored-by: lu-cashmoney --- .../document_loaders/examples/bigquery.ipynb | 202 ++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/bigquery.py | 57 +++++ .../document_loaders/test_bigquery.py | 50 +++++ 4 files changed, 311 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/bigquery.ipynb create mode 100644 langchain/document_loaders/bigquery.py create mode 100644 tests/integration_tests/document_loaders/test_bigquery.py diff --git a/docs/modules/indexes/document_loaders/examples/bigquery.ipynb b/docs/modules/indexes/document_loaders/examples/bigquery.ipynb new file mode 100644 index 00000000..f34f0f17 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/bigquery.ipynb @@ -0,0 +1,202 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigQuery Loader\n", + "\n", + "Load a BigQuery query with one document per row." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import BigQueryLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_QUERY = '''\n", + "SELECT\n", + " id,\n", + " dna_sequence,\n", + " organism\n", + "FROM (\n", + " SELECT\n", + " ARRAY (\n", + " SELECT\n", + " AS STRUCT 1 AS id, \"ATTCGA\" AS dna_sequence, \"Lokiarchaeum sp. (strain GC14_75).\" AS organism\n", + " UNION ALL\n", + " SELECT\n", + " AS STRUCT 2 AS id, \"AGGCGA\" AS dna_sequence, \"Heimdallarchaeota archaeon (strain LC_2).\" AS organism\n", + " UNION ALL\n", + " SELECT\n", + " AS STRUCT 3 AS id, \"TCCGGA\" AS dna_sequence, \"Acidianus hospitalis (strain W1).\" AS organism) AS new_array),\n", + " UNNEST(new_array)\n", + "'''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic Usage" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "loader = BigQueryLoader(BASE_QUERY)\n", + "\n", + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='id: 1\\ndna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).', lookup_str='', metadata={}, lookup_index=0), Document(page_content='id: 2\\ndna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).', lookup_str='', metadata={}, lookup_index=0), Document(page_content='id: 3\\ndna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).', lookup_str='', metadata={}, lookup_index=0)]\n" + ] + } + ], + "source": [ + "print(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specifying Which Columns are Content vs Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "loader = BigQueryLoader(BASE_QUERY, page_content_columns=[\"dna_sequence\", \"organism\"], metadata_columns=[\"id\"])\n", + "\n", + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='dna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).', lookup_str='', metadata={'id': 1}, lookup_index=0), Document(page_content='dna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).', lookup_str='', metadata={'id': 2}, lookup_index=0), Document(page_content='dna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).', lookup_str='', metadata={'id': 3}, lookup_index=0)]\n" + ] + } + ], + "source": [ + "print(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding Source to Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Note that the `id` column is being returned twice, with one instance aliased as `source`\n", + "ALIASED_QUERY = '''\n", + "SELECT\n", + " id,\n", + " dna_sequence,\n", + " organism,\n", + " id as source\n", + "FROM (\n", + " SELECT\n", + " ARRAY (\n", + " SELECT\n", + " AS STRUCT 1 AS id, \"ATTCGA\" AS dna_sequence, \"Lokiarchaeum sp. (strain GC14_75).\" AS organism\n", + " UNION ALL\n", + " SELECT\n", + " AS STRUCT 2 AS id, \"AGGCGA\" AS dna_sequence, \"Heimdallarchaeota archaeon (strain LC_2).\" AS organism\n", + " UNION ALL\n", + " SELECT\n", + " AS STRUCT 3 AS id, \"TCCGGA\" AS dna_sequence, \"Acidianus hospitalis (strain W1).\" AS organism) AS new_array),\n", + " UNNEST(new_array)\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "loader = BigQueryLoader(ALIASED_QUERY, metadata_columns=[\"source\"])\n", + "\n", + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='id: 1\\ndna_sequence: ATTCGA\\norganism: Lokiarchaeum sp. (strain GC14_75).\\nsource: 1', lookup_str='', metadata={'source': 1}, lookup_index=0), Document(page_content='id: 2\\ndna_sequence: AGGCGA\\norganism: Heimdallarchaeota archaeon (strain LC_2).\\nsource: 2', lookup_str='', metadata={'source': 2}, lookup_index=0), Document(page_content='id: 3\\ndna_sequence: TCCGGA\\norganism: Acidianus hospitalis (strain W1).\\nsource: 3', lookup_str='', metadata={'source': 3}, lookup_index=0)]\n" + ] + } + ], + "source": [ + "print(data)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 92f17c83..b2976fe1 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -8,6 +8,7 @@ from langchain.document_loaders.azure_blob_storage_container import ( from langchain.document_loaders.azure_blob_storage_file import ( AzureBlobStorageFileLoader, ) +from langchain.document_loaders.bigquery import BigQueryLoader from langchain.document_loaders.blackboard import BlackboardLoader from langchain.document_loaders.college_confidential import CollegeConfidentialLoader from langchain.document_loaders.conllu import CoNLLULoader @@ -122,4 +123,5 @@ __all__ = [ "AzureBlobStorageContainerLoader", "SitemapLoader", "DuckDBLoader", + "BigQueryLoader", ] diff --git a/langchain/document_loaders/bigquery.py b/langchain/document_loaders/bigquery.py new file mode 100644 index 00000000..b9e1fe1d --- /dev/null +++ b/langchain/document_loaders/bigquery.py @@ -0,0 +1,57 @@ +from typing import List, Optional + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class BigQueryLoader(BaseLoader): + """Loads a query result from BigQuery into a list of documents. + + Each document represents one row of the result. The `page_content_columns` + are written into the `page_content` of the document. The `metadata_columns` + are written into the `metadata` of the document. By default, all columns + are written into the `page_content` and none into the `metadata`. + """ + + def __init__( + self, + query: str, + project: Optional[str] = None, + page_content_columns: Optional[List[str]] = None, + metadata_columns: Optional[List[str]] = None, + ): + self.query = query + self.project = project + self.page_content_columns = page_content_columns + self.metadata_columns = metadata_columns + + def load(self) -> List[Document]: + try: + from google.cloud import bigquery + except ImportError as ex: + raise ValueError( + "Could not import google-cloud-bigquery python package. " + "Please install it with `pip install google-cloud-bigquery`." + ) from ex + + bq_client = bigquery.Client(self.project) + query_result = bq_client.query(self.query).result() + docs: List[Document] = [] + + page_content_columns = self.page_content_columns + metadata_columns = self.metadata_columns + + if page_content_columns is None: + page_content_columns = [column.name for column in query_result.schema] + if metadata_columns is None: + metadata_columns = [] + + for row in query_result: + page_content = "\n".join( + f"{k}: {v}" for k, v in row.items() if k in page_content_columns + ) + metadata = {k: v for k, v in row.items() if k in metadata_columns} + doc = Document(page_content=page_content, metadata=metadata) + docs.append(doc) + + return docs diff --git a/tests/integration_tests/document_loaders/test_bigquery.py b/tests/integration_tests/document_loaders/test_bigquery.py new file mode 100644 index 00000000..24b55056 --- /dev/null +++ b/tests/integration_tests/document_loaders/test_bigquery.py @@ -0,0 +1,50 @@ +import pytest + +from langchain.document_loaders.bigquery import BigQueryLoader + +try: + from google.cloud import bigquery # noqa: F401 + + bigquery_installed = True +except ImportError: + bigquery_installed = False + + +@pytest.mark.skipif(not bigquery_installed, reason="bigquery not installed") +def test_bigquery_loader_no_options() -> None: + loader = BigQueryLoader("SELECT 1 AS a, 2 AS b") + docs = loader.load() + + assert len(docs) == 1 + assert docs[0].page_content == "a: 1\nb: 2" + assert docs[0].metadata == {} + + +@pytest.mark.skipif(not bigquery_installed, reason="bigquery not installed") +def test_bigquery_loader_page_content_columns() -> None: + loader = BigQueryLoader( + "SELECT 1 AS a, 2 AS b UNION ALL SELECT 3 AS a, 4 AS b", + page_content_columns=["a"], + ) + docs = loader.load() + + assert len(docs) == 2 + assert docs[0].page_content == "a: 1" + assert docs[0].metadata == {} + + assert docs[1].page_content == "a: 3" + assert docs[1].metadata == {} + + +@pytest.mark.skipif(not bigquery_installed, reason="bigquery not installed") +def test_bigquery_loader_metadata_columns() -> None: + loader = BigQueryLoader( + "SELECT 1 AS a, 2 AS b", + page_content_columns=["a"], + metadata_columns=["b"], + ) + docs = loader.load() + + assert len(docs) == 1 + assert docs[0].page_content == "a: 1" + assert docs[0].metadata == {"b": 2}