From f74a1bebf5294f4781c3884de4266e9d865b087d Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Mon, 27 Mar 2023 19:51:34 -0700 Subject: [PATCH] Harrison/duckdb (#2064) Co-authored-by: Trent Hauck --- .../document_loaders/examples/duckdb.ipynb | 175 ++++++++++++++++++ langchain/document_loaders/__init__.py | 4 +- langchain/document_loaders/duckdb_loader.py | 74 ++++++++ .../document_loaders/test_duckdb.py | 56 ++++++ 4 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 docs/modules/indexes/document_loaders/examples/duckdb.ipynb create mode 100644 langchain/document_loaders/duckdb_loader.py create mode 100644 tests/integration_tests/document_loaders/test_duckdb.py diff --git a/docs/modules/indexes/document_loaders/examples/duckdb.ipynb b/docs/modules/indexes/document_loaders/examples/duckdb.ipynb new file mode 100644 index 00000000..b842f0c7 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/duckdb.ipynb @@ -0,0 +1,175 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DuckDB Loader\n", + "\n", + "Load a DuckDB query with one document per row." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import DuckDBLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing example.csv\n" + ] + } + ], + "source": [ + "%%file example.csv\n", + "Team,Payroll\n", + "Nationals,81.34\n", + "Reds,82.20" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "loader = DuckDBLoader(\"SELECT * FROM read_csv_auto('example.csv')\")\n", + "\n", + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='Team: Nationals\\nPayroll: 81.34', metadata={}), Document(page_content='Team: Reds\\nPayroll: 82.2', metadata={})]\n" + ] + } + ], + "source": [ + "print(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specifying Which Columns are Content vs Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "loader = DuckDBLoader(\n", + " \"SELECT * FROM read_csv_auto('example.csv')\",\n", + " page_content_columns=[\"Team\"],\n", + " metadata_columns=[\"Payroll\"]\n", + ")\n", + "\n", + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='Team: Nationals', metadata={'Payroll': 81.34}), Document(page_content='Team: Reds', metadata={'Payroll': 82.2})]\n" + ] + } + ], + "source": [ + "print(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding Source to Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + 
"outputs": [], + "source": [ + "loader = DuckDBLoader(\n", + " \"SELECT Team, Payroll, Team As source FROM read_csv_auto('example.csv')\",\n", + " metadata_columns=[\"source\"]\n", + ")\n", + "\n", + "data = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='Team: Nationals\\nPayroll: 81.34\\nsource: Nationals', metadata={'source': 'Nationals'}), Document(page_content='Team: Reds\\nPayroll: 82.2\\nsource: Reds', metadata={'source': 'Reds'})]\n" + ] + } + ], + "source": [ + "print(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index ae895139..43420ef2 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -14,6 +14,7 @@ from langchain.document_loaders.conllu import CoNLLULoader from langchain.document_loaders.csv_loader import CSVLoader from langchain.document_loaders.dataframe import DataFrameLoader from langchain.document_loaders.directory import DirectoryLoader +from langchain.document_loaders.duckdb_loader import DuckDBLoader from langchain.document_loaders.email import UnstructuredEmailLoader from langchain.document_loaders.evernote import EverNoteLoader from langchain.document_loaders.facebook_chat import FacebookChatLoader @@ -61,7 +62,7 @@ from langchain.document_loaders.youtube 
class DuckDBLoader(BaseLoader):
    """Loads a query result from DuckDB into a list of documents.

    Each document represents one row of the result. The `page_content_columns`
    are written into the `page_content` of the document, one ``name: value``
    line per column. The `metadata_columns` are written into the `metadata`
    of the document. By default, all columns are written into the
    `page_content` and none into the `metadata`.
    """

    def __init__(
        self,
        query: str,
        database: str = ":memory:",
        read_only: bool = False,
        config: Optional[Dict[str, str]] = None,
        page_content_columns: Optional[List[str]] = None,
        metadata_columns: Optional[List[str]] = None,
    ):
        """Store the query and connection settings; nothing is executed here.

        Args:
            query: SQL text passed unchanged to ``execute`` when ``load`` runs.
            database: Forwarded as ``database`` to ``duckdb.connect``.
            read_only: Forwarded as ``read_only`` to ``duckdb.connect``.
            config: Forwarded as ``config`` to ``duckdb.connect``. ``None``
                (or any falsy value) is replaced by an empty dict.
            page_content_columns: Result columns rendered into
                ``page_content``. ``None`` means every result column.
            metadata_columns: Result columns copied into ``metadata``.
                ``None`` means no columns.
        """
        self.query = query
        self.database = database
        self.read_only = read_only
        self.config = config or {}
        self.page_content_columns = page_content_columns
        self.metadata_columns = metadata_columns

    def load(self) -> List[Document]:
        """Run the query and build one ``Document`` per result row.

        Returns:
            The documents, in the order the rows were fetched.

        Raises:
            ValueError: If the ``duckdb`` package cannot be imported, or if a
                name in ``page_content_columns`` / ``metadata_columns`` is not
                a column of the query result.
        """
        # Imported lazily so the module can be imported without duckdb.
        try:
            import duckdb
        except ImportError:
            raise ValueError(
                "Could not import duckdb python package. "
                "Please install it with `pip install duckdb`."
            )

        with duckdb.connect(
            database=self.database, read_only=self.read_only, config=self.config
        ) as con:
            query_result = con.execute(self.query)
            results = query_result.fetchall()
            description = cast(list, query_result.description)

        field_names = [c[0] for c in description]

        # Resolve each column name to its position once instead of calling
        # ``list.index`` for every cell. ``setdefault`` keeps the first
        # position when a name is repeated, which is what ``list.index`` did.
        column_index: Dict[str, int] = {}
        for position, name in enumerate(field_names):
            column_index.setdefault(name, position)

        if self.page_content_columns is None:
            page_content_columns = field_names
        else:
            page_content_columns = self.page_content_columns

        if self.metadata_columns is None:
            metadata_columns: List[str] = []
        else:
            metadata_columns = self.metadata_columns

        # Fail with a descriptive message (same exception type that the bare
        # ``list.index`` lookup used to raise) when a requested column is
        # absent from the result.
        requested = dict.fromkeys([*page_content_columns, *metadata_columns])
        missing = [name for name in requested if name not in column_index]
        if missing:
            raise ValueError(
                f"Column(s) {missing} not found in query result; "
                f"available columns: {field_names}"
            )

        content_slots = [(name, column_index[name]) for name in page_content_columns]
        metadata_slots = [(name, column_index[name]) for name in metadata_columns]

        docs = []
        for row in results:
            page_content = "\n".join(
                f"{name}: {row[position]}" for name, position in content_slots
            )
            metadata = {name: row[position] for name, position in metadata_slots}
            docs.append(Document(page_content=page_content, metadata=metadata))

        return docs
@unittest.skipIf(not duckdb_installed, "duckdb not installed")
def test_duckdb_loader_page_content_columns() -> None:
    """Only the columns named in ``page_content_columns`` reach the content."""

    # A UNION on its own does not promise a row order, so sort explicitly
    # before asserting on documents by position.
    loader = DuckDBLoader(
        "SELECT 1 AS a, 2 AS b UNION SELECT 3 AS a, 4 AS b ORDER BY a",
        page_content_columns=["a"],
    )
    docs = loader.load()

    assert len(docs) == 2

    assert docs[0].page_content == "a: 1"
    assert docs[0].metadata == {}

    assert docs[1].page_content == "a: 3"
    assert docs[1].metadata == {}


@unittest.skipIf(not duckdb_installed, "duckdb not installed")
def test_duckdb_loader_metadata_columns() -> None:
    """Column ``a`` feeds the page content and column ``b`` the metadata."""

    loader = DuckDBLoader(
        "SELECT 1 AS a, 2 AS b",
        page_content_columns=["a"],
        metadata_columns=["b"],
    )
    docs = loader.load()

    assert len(docs) == 1
    assert docs[0].page_content == "a: 1"
    assert docs[0].metadata == {"b": 2}