From 96dc180883c94bb36238b5964f08be14fd51ae30 Mon Sep 17 00:00:00 2001 From: Hugoberry Date: Mon, 25 Mar 2024 07:02:35 +0000 Subject: [PATCH] community[minor]: Add `DuckDB` as a vectorstore (#18916) DuckDB has a cosine similarity function along list and array data types, which can be used as a vector store. - **Description:** The latest version of DuckDB features a cosine similarity function, which can be used with its support for list or array column types. This PR surfaces this functionality to langchain. - **Dependencies:** duckdb 0.10.0 - **Twitter handle:** @igocrite --------- Co-authored-by: Eugene Yurtsev Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> --- .../integrations/vectorstores/duckdb.ipynb | 108 +++++++ .../vectorstores/__init__.py | 1 + .../vectorstores/duckdb.py | 263 ++++++++++++++++++ .../vectorstores/test_duckdb.py | 160 +++++++++++ .../vectorstores/test_public_api.py | 1 + 5 files changed, 533 insertions(+) create mode 100644 docs/docs/integrations/vectorstores/duckdb.ipynb create mode 100644 libs/community/langchain_community/vectorstores/duckdb.py create mode 100644 libs/community/tests/integration_tests/vectorstores/test_duckdb.py diff --git a/docs/docs/integrations/vectorstores/duckdb.ipynb b/docs/docs/integrations/vectorstores/duckdb.ipynb new file mode 100644 index 0000000000..6a62fe218f --- /dev/null +++ b/docs/docs/integrations/vectorstores/duckdb.ipynb @@ -0,0 +1,108 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DuckDB\n", + "This notebook shows how to use `DuckDB` as a vector store." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install duckdb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to use OpenAIEmbeddings so we have to get the OpenAI API Key. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import OpenAIEmbeddings\n", + "from langchain.vectorstores import DuckDB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import TextLoader\n", + "from langchain_text_splitters import CharacterTextSplitter\n", + "\n", + "loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "\n", + "documents = CharacterTextSplitter().split_documents(documents)\n", + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docsearch = DuckDB.from_documents(documents, embeddings)\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = docsearch.similarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(docs[0].page_content)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index aa4f4d8980..3632643948 100644 --- 
a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -51,6 +51,7 @@ _module_lookup = { "DocArrayHnswSearch": "langchain_community.vectorstores.docarray", "DocArrayInMemorySearch": "langchain_community.vectorstores.docarray", "DocumentDBVectorSearch": "langchain_community.vectorstores.documentdb", + "DuckDB": "langchain_community.vectorstores.duckdb", "ElasticKnnSearch": "langchain_community.vectorstores.elastic_vector_search", "ElasticVectorSearch": "langchain_community.vectorstores.elastic_vector_search", "ElasticsearchStore": "langchain_community.vectorstores.elasticsearch", diff --git a/libs/community/langchain_community/vectorstores/duckdb.py b/libs/community/langchain_community/vectorstores/duckdb.py new file mode 100644 index 0000000000..dd3b1611e8 --- /dev/null +++ b/libs/community/langchain_community/vectorstores/duckdb.py @@ -0,0 +1,263 @@ +# mypy: disable-error-code=func-returns-value +from __future__ import annotations + +import json +import uuid +from typing import Any, Iterable, List, Optional, Type + +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VST, VectorStore + + +class DuckDB(VectorStore): + """`DuckDB` vector store. + + This class provides a vector store interface for adding texts and performing + similarity searches using DuckDB. + + For more information about DuckDB, see: https://duckdb.org/ + + This integration requires the `duckdb` Python package. + You can install it with `pip install duckdb`. + + *Security Notice*: The default DuckDB configuration is not secure. + + By **default**, DuckDB can interact with files across the entire file system, + which includes abilities to read, write, and list files and directories. + It can also access some python variables present in the global namespace. 
+ + When using this DuckDB vectorstore, we suggest that you initialize the + DuckDB connection with a secure configuration. + + For example, you can set `enable_external_access` to `false` in the connection + configuration to disable external access to the DuckDB connection. + + You can view the DuckDB configuration options here: + + https://duckdb.org/docs/configuration/overview.html + + Please review other relevant security considerations in the DuckDB + documentation. (e.g., "autoinstall_known_extensions": "false", + "autoload_known_extensions": "false") + + See https://python.langchain.com/docs/security for more information. + + Args: + connection: Optional DuckDB connection + embedding: The embedding function or model to use for generating embeddings. + vector_key: The column name for storing vectors. Defaults to `embedding`. + id_key: The column name for storing unique identifiers. Defaults to `id`. + text_key: The column name for storing text. Defaults to `text`. + table_name: The name of the table to use for storing embeddings. Defaults to + `vectorstore`. + + Example: + .. code-block:: python + + import duckdb + conn = duckdb.connect(database=':memory:', + config={ + # Sample configuration to restrict some DuckDB capabilities + # List is not exhaustive. Please review DuckDB documentation. + "enable_external_access": "false", + "autoinstall_known_extensions": "false", + "autoload_known_extensions": "false" + } + ) + embedding_function = ...
# Define or import your embedding function here + vector_store = DuckDB(connection=conn, embedding=embedding_function) + vector_store.add_texts(['text1', 'text2']) + result = vector_store.similarity_search('text1') + """ + + def __init__( + self, + *, + connection: Optional[Any] = None, + embedding: Embeddings, + vector_key: str = "embedding", + id_key: str = "id", + text_key: str = "text", + table_name: str = "vectorstore", + ): + """Initialize with DuckDB connection and setup for vector storage.""" + try: + import duckdb + except ImportError: + raise ImportError( + "Could not import duckdb package. " + "Please install it with `pip install duckdb`." + ) + self.duckdb = duckdb + self._embedding = embedding + self._vector_key = vector_key + self._id_key = id_key + self._text_key = text_key + self._table_name = table_name + + if self._embedding is None: + raise ValueError("An embedding function or model must be provided.") + + if connection is None: + import warnings + + warnings.warn( + "No DuckDB connection provided. A new connection will be created. " + "This connection is running in memory and no data will be persisted. " + "To persist data, specify `connection=duckdb.connect(...)` when using " + "the API. Please review the documentation of the vectorstore for " + "security recommendations on configuring the connection." + ) + + self._connection = connection or self.duckdb.connect( + database=":memory:", config={"enable_external_access": "false"} + ) + self._ensure_table() + self._table = self._connection.table(self._table_name) + + @property + def embeddings(self) -> Optional[Embeddings]: + """Returns the embedding object used by the vector store.""" + return self._embedding + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + """Turn texts into embeddings and add them to the database, one row per text. + + Args: + texts: Iterable of strings to add to the vectorstore.
+ metadatas: Optional list of metadatas associated with the texts. + kwargs: Additional parameters including optional 'ids' to associate + with the texts. + + Returns: + List of ids of the added texts. + """ + + # Materialize once: a one-shot iterable would otherwise be exhausted + texts = list(texts) + + # Use caller-supplied ids, or generate new ones if not provided + ids = kwargs.pop("ids", None) or [str(uuid.uuid4()) for _ in texts] + embeddings = self._embedding.embed_documents(texts) + for idx, text in enumerate(texts): + embedding = embeddings[idx] + # Serialize metadata if present, else default to None + metadata = ( + json.dumps(metadatas[idx]) + if metadatas and idx < len(metadatas) + else None + ) + self._connection.execute( + f"INSERT INTO {self._table_name} VALUES (?,?,?,?)", + [ids[idx], text, embedding, metadata], + ) + return ids + + def similarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Document]: + """Performs a similarity search for a given query string. + + Args: + query: The query string to search for. + k: The number of similar texts to return. + + Returns: + A list of Documents most similar to the query.
+ """ + embedding = self._embedding.embed_query(query) # type: ignore + list_cosine_similarity = self.duckdb.FunctionExpression( + "list_cosine_similarity", + self.duckdb.ColumnExpression(self._vector_key), + self.duckdb.ConstantExpression(embedding), + ) + docs = ( + self._table.select( + *[ + self.duckdb.StarExpression(exclude=[]), + list_cosine_similarity.alias("similarity"), + ] + ) + .order("similarity desc") + .limit(k) + .select( + self.duckdb.StarExpression(exclude=["similarity", self._vector_key]) + ) + .fetchdf() + ) + return [ + Document( + page_content=docs[self._text_key][idx], + metadata=json.loads(docs["metadata"][idx]) + if docs["metadata"][idx] + else {}, + ) + for idx in range(len(docs)) + ] + + @classmethod + def from_texts( + cls: Type[VST], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> DuckDB: + """Creates an instance of DuckDB and populates it with texts and + their embeddings. + + Args: + texts: List of strings to add to the vector store. + embedding: The embedding function or model to use for generating embeddings. + metadatas: Optional list of metadata dictionaries associated with the texts. + **kwargs: Additional keyword arguments including: + - connection: DuckDB connection. If not provided, a new connection will + be created. + - vector_key: The column name for storing vectors. Default "vector". + - id_key: The column name for storing unique identifiers. Default "id". + - text_key: The column name for storing text. Defaults to "text". + - table_name: The name of the table to use for storing embeddings. + Defaults to "embeddings". + + Returns: + An instance of DuckDB with the provided texts and their embeddings added. 
+ """ + + # Extract kwargs for DuckDB instance creation + connection = kwargs.get("connection", None) + vector_key = kwargs.get("vector_key", "vector") + id_key = kwargs.get("id_key", "id") + text_key = kwargs.get("text_key", "text") + table_name = kwargs.get("table_name", "embeddings") + + # Create an instance of DuckDB + instance = DuckDB( + connection=connection, + embedding=embedding, + vector_key=vector_key, + id_key=id_key, + text_key=text_key, + table_name=table_name, + ) + # Add texts and their embeddings to the DuckDB vector store + instance.add_texts(texts, metadatas=metadatas, **kwargs) + + return instance + + def _ensure_table(self) -> None: + """Ensures the table for storing embeddings exists.""" + create_table_sql = f""" + CREATE TABLE IF NOT EXISTS {self._table_name} ( + {self._id_key} VARCHAR PRIMARY KEY, + {self._text_key} VARCHAR, + {self._vector_key} FLOAT[], + metadata VARCHAR + ) + """ + self._connection.execute(create_table_sql) diff --git a/libs/community/tests/integration_tests/vectorstores/test_duckdb.py b/libs/community/tests/integration_tests/vectorstores/test_duckdb.py new file mode 100644 index 0000000000..b724dcf054 --- /dev/null +++ b/libs/community/tests/integration_tests/vectorstores/test_duckdb.py @@ -0,0 +1,160 @@ +from typing import Dict, Iterator, List +from uuid import uuid4 + +import duckdb +import pytest + +from langchain_community.vectorstores import DuckDB +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + + +@pytest.fixture +def duckdb_connection() -> Iterator[duckdb.DuckDBPyConnection]: + # Setup a temporary DuckDB database + conn = duckdb.connect(":memory:") + yield conn + conn.close() + + +@pytest.fixture +def embeddings() -> FakeEmbeddings: + return FakeEmbeddings() + + +@pytest.fixture +def texts() -> List[str]: + return ["text 1", "text 2", "item 3"] + + +@pytest.fixture +def metadatas() -> List[Dict[str, str]]: + return [ + {"source": "Document 1"}, + {"source": "Document 2"}, + 
{"source": "Document 3"}, + ] + + +@pytest.mark.requires("duckdb") +def test_duckdb_with_connection( + duckdb_connection: duckdb.DuckDBPyConnection, + embeddings: FakeEmbeddings, + texts: List[str], +) -> None: + store = DuckDB( + connection=duckdb_connection, embedding=embeddings, table_name="test_table" + ) + store.add_texts(texts) + result = store.similarity_search("text 1") + result_texts = [doc.page_content for doc in result] + assert "text 1" in result_texts + + +@pytest.mark.requires("duckdb") +def test_duckdb_without_connection( + embeddings: FakeEmbeddings, texts: List[str] +) -> None: + store = DuckDB(embedding=embeddings, table_name="test_table") + store.add_texts(texts) + result = store.similarity_search("text 1") + result_texts = [doc.page_content for doc in result] + assert "text 1" in result_texts + + +@pytest.mark.requires("duckdb") +def test_duckdb_add_texts(embeddings: FakeEmbeddings) -> None: + store = DuckDB(embedding=embeddings, table_name="test_table") + store.add_texts(["text 2"]) + result = store.similarity_search("text 2") + result_texts = [doc.page_content for doc in result] + assert "text 2" in result_texts + + +@pytest.mark.requires("duckdb") +def test_duckdb_add_texts_with_metadata( + duckdb_connection: duckdb.DuckDBPyConnection, embeddings: FakeEmbeddings +) -> None: + store = DuckDB( + connection=duckdb_connection, + embedding=embeddings, + table_name="test_table_with_metadata", + ) + texts = ["text with metadata 1", "text with metadata 2"] + metadatas = [ + {"author": "Author 1", "date": "2021-01-01"}, + {"author": "Author 2", "date": "2021-02-01"}, + ] + + # Add texts along with their metadata + store.add_texts(texts, metadatas=metadatas) + + # Perform a similarity search to retrieve the documents + result = store.similarity_search("text with metadata", k=2) + + # Check if the metadata is correctly associated with the texts + assert len(result) == 2, "Should return two results" + assert ( + result[0].metadata.get("author") == 
"Author 1" + ), "Metadata for Author 1 should be correctly retrieved" + assert ( + result[0].metadata.get("date") == "2021-01-01" + ), "Date for Author 1 should be correctly retrieved" + assert ( + result[1].metadata.get("author") == "Author 2" + ), "Metadata for Author 2 should be correctly retrieved" + assert ( + result[1].metadata.get("date") == "2021-02-01" + ), "Date for Author 2 should be correctly retrieved" + + +@pytest.mark.requires("duckdb") +def test_duckdb_add_texts_with_predefined_ids( + duckdb_connection: duckdb.DuckDBPyConnection, embeddings: FakeEmbeddings +) -> None: + store = DuckDB( + connection=duckdb_connection, + embedding=embeddings, + table_name="test_table_predefined_ids", + ) + texts = ["unique text 1", "unique text 2"] + predefined_ids = [str(uuid4()), str(uuid4())] # Generate unique IDs + + # Add texts with the predefined IDs + store.add_texts(texts, ids=predefined_ids) + + # Perform a similarity search for each text and check if it's found + for text in texts: + result = store.similarity_search(text) + + found_texts = [doc.page_content for doc in result] + assert ( + text in found_texts + ), f"Text '{text}' was not found in the search results." 
+ + +@pytest.mark.requires("duckdb") +def test_duckdb_from_texts( + duckdb_connection: duckdb.DuckDBPyConnection, + embeddings: FakeEmbeddings, + texts: List[str], + metadatas: List[Dict[str, str]], +) -> None: + # Initialize DuckDB from texts using the from_texts class method + store = DuckDB.from_texts( + texts=texts, + embedding=embeddings, + metadatas=metadatas, + connection=duckdb_connection, + table_name="test_from_texts_table", + ) + + # Perform a similarity search to retrieve the documents + query_text = "sample text" + result = store.similarity_search(query_text, k=2) + + # Verify that the vector store was populated and can return results + assert len(result) > 0, "Should return at least one result" + + # Optionally, check that metadata is correctly associated with the texts + for doc in result: + assert "source" in doc.metadata, "Document metadata should include 'source' key" diff --git a/libs/community/tests/unit_tests/vectorstores/test_public_api.py b/libs/community/tests/unit_tests/vectorstores/test_public_api.py index ce06b08161..7ad31825e9 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_public_api.py +++ b/libs/community/tests/unit_tests/vectorstores/test_public_api.py @@ -28,6 +28,7 @@ _EXPECTED = [ "DocArrayHnswSearch", "DocArrayInMemorySearch", "DocumentDBVectorSearch", + "DuckDB", "ElasticKnnSearch", "ElasticVectorSearch", "ElasticsearchStore",