community[minor]: Add `DuckDB` as a vectorstore (#18916)

DuckDB provides a cosine similarity function that operates on its list and array data types, so it can be used as a vector store. - **Description:** The latest version of DuckDB features a cosine similarity function that works with its list and array column types. This PR exposes that functionality in LangChain as a vector store. - **Dependencies:** duckdb 0.10.0 - **Twitter handle:** @igocrite --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
2 months ago · 96dc180883
parent fa6397d76a
commit 96dc180883
5 changed files with 533 additions and 0 deletions
--- a/docs/docs/integrations/vectorstores/duckdb.ipynb
+++ b/docs/docs/integrations/vectorstores/duckdb.ipynb
@ -0,0 +1,108 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# DuckDB\n",
+    "This notebook shows how to use `DuckDB` as a vector store."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install duckdb"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We want to use OpenAIEmbeddings so we have to get the OpenAI API Key. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.vectorstores import DuckDB\n",
+    "from langchain_openai import OpenAIEmbeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import TextLoader\n",
+    "from langchain_text_splitters import CharacterTextSplitter\n",
+    "\n",
+    "loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
+    "documents = loader.load()\n",
+    "\n",
+    "documents = CharacterTextSplitter().split_documents(documents)\n",
+    "embeddings = OpenAIEmbeddings()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docsearch = DuckDB.from_documents(documents, embeddings)\n",
+    "\n",
+    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
+    "docs = docsearch.similarity_search(query)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(docs[0].page_content)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/libs/community/langchain_community/vectorstores/init.py
+++ b/libs/community/langchain_community/vectorstores/init.py
@ -51,6 +51,7 @@ _module_lookup = {
    "DocArrayHnswSearch": "langchain_community.vectorstores.docarray",
    "DocArrayInMemorySearch": "langchain_community.vectorstores.docarray",
    "DocumentDBVectorSearch": "langchain_community.vectorstores.documentdb",
+    "DuckDB": "langchain_community.vectorstores.duckdb",
    "ElasticKnnSearch": "langchain_community.vectorstores.elastic_vector_search",
    "ElasticVectorSearch": "langchain_community.vectorstores.elastic_vector_search",
    "ElasticsearchStore": "langchain_community.vectorstores.elasticsearch",
--- a/libs/community/langchain_community/vectorstores/duckdb.py
+++ b/libs/community/langchain_community/vectorstores/duckdb.py
@ -0,0 +1,263 @@
+# mypy: disable-error-code=func-returns-value
+from __future__ import annotations
+
+import json
+import uuid
+from typing import Any, Iterable, List, Optional, Type
+
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+from langchain_core.vectorstores import VST, VectorStore
+
+
+class DuckDB(VectorStore):
+    """`DuckDB` vector store.
+
+    This class provides a vector store interface for adding texts and performing
+    similarity searches using DuckDB.
+
+    For more information about DuckDB, see: https://duckdb.org/
+
+    This integration requires the `duckdb` Python package.
+    You can install it with `pip install duckdb`.
+
+    *Security Notice*: The default DuckDB configuration is not secure.
+
+        By **default**, DuckDB can interact with files across the entire file system,
+        which includes abilities to read, write, and list files and directories.
+        It can also access some python variables present in the global namespace.
+
+        When using this DuckDB vectorstore, we suggest that you initialize the
+        DuckDB connection with a secure configuration.
+
+        For example, you can set `enable_external_access` to `false` in the connection
+        configuration to disable external access to the DuckDB connection.
+
+        You can view the DuckDB configuration options here:
+
+        https://duckdb.org/docs/configuration/overview.html
+
+        Please review other relevant security considerations in the DuckDB
+        documentation. (e.g., "autoinstall_known_extensions": "false",
+        "autoload_known_extensions": "false")
+
+        See https://python.langchain.com/docs/security for more information.
+
+    Args:
+        connection: Optional DuckDB connection. When omitted, an in-memory
+          connection with `enable_external_access` disabled is created.
+        embedding: The embedding function or model to use for generating embeddings.
+        vector_key: The column name for storing vectors. Defaults to `embedding`.
+        id_key: The column name for storing unique identifiers. Defaults to `id`.
+        text_key: The column name for storing text. Defaults to `text`.
+        table_name: The name of the table to use for storing embeddings. Defaults to
+          `vectorstore`.
+
+    Example:
+        .. code-block:: python
+
+            import duckdb
+            conn = duckdb.connect(database=':memory:',
+                config={
+                    # Sample configuration to restrict some DuckDB capabilities
+                    # List is not exhaustive. Please review DuckDB documentation.
+                        "enable_external_access": "false",
+                        "autoinstall_known_extensions": "false",
+                        "autoload_known_extensions": "false"
+                    }
+            )
+            embedding_function = ... # Define or import your embedding function here
+            # All constructor arguments are keyword-only.
+            vector_store = DuckDB(connection=conn, embedding=embedding_function)
+            vector_store.add_texts(['text1', 'text2'])
+            result = vector_store.similarity_search('text1')
+    """
+
+    def __init__(
+        self,
+        *,
+        connection: Optional[Any] = None,
+        embedding: Embeddings,
+        vector_key: str = "embedding",
+        id_key: str = "id",
+        text_key: str = "text",
+        table_name: str = "vectorstore",
+    ):
+        """Initialize with DuckDB connection and setup for vector storage.
+
+        Creates the backing table if it does not exist yet.
+
+        Raises:
+            ImportError: If the `duckdb` package cannot be imported.
+            ValueError: If `embedding` is None.
+        """
+        try:
+            import duckdb
+        except ImportError as e:
+            raise ImportError(
+                "Could not import duckdb package. "
+                "Please install it with `pip install duckdb`."
+            ) from e
+
+        if embedding is None:
+            raise ValueError("An embedding function or model must be provided.")
+
+        self.duckdb = duckdb
+        self._embedding = embedding
+        self._vector_key = vector_key
+        self._id_key = id_key
+        self._text_key = text_key
+        self._table_name = table_name
+
+        if connection is None:
+            import warnings
+
+            warnings.warn(
+                "No DuckDB connection provided. A new connection will be created. "
+                "This connection is running in memory and no data will be persisted. "
+                "To persist data, specify `connection=duckdb.connect(...)` when using "
+                "the API. Please review the documentation of the vectorstore for "
+                "security recommendations on configuring the connection."
+            )
+            connection = self.duckdb.connect(
+                database=":memory:", config={"enable_external_access": "false"}
+            )
+
+        self._connection = connection
+        self._ensure_table()
+        self._table = self._connection.table(self._table_name)
+
+    @property
+    def embeddings(self) -> Optional[Embeddings]:
+        """Returns the embedding object used by the vector store."""
+        return self._embedding
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Embed texts and insert them, one row per text, into the DuckDB table.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+              Each entry is stored as a JSON string; texts without a matching
+              entry get a NULL metadata value.
+            kwargs: Additional parameters including optional 'ids' to associate
+              with the texts. Random UUIDs are generated when 'ids' is missing
+              or empty.
+
+        Returns:
+            List of ids of the added texts.
+
+        Raises:
+            ValueError: If the number of supplied ids differs from the number
+              of texts.
+        """
+        # Materialize once: `texts` may be a one-shot iterator (e.g. a generator),
+        # and it is needed for id generation, embedding and insertion.
+        text_list = list(texts)
+
+        ids = kwargs.pop("ids", None)
+        ids = list(ids) if ids else [str(uuid.uuid4()) for _ in text_list]
+        if len(ids) != len(text_list):
+            raise ValueError(
+                f"Number of ids ({len(ids)}) does not match "
+                f"number of texts ({len(text_list)})."
+            )
+
+        embeddings = self._embedding.embed_documents(text_list)
+
+        # NOTE: the table name is interpolated into the SQL text because
+        # identifiers cannot be bound as parameters; it must come from a
+        # trusted source. Row values are bound as parameters.
+        insert_sql = f"INSERT INTO {self._table_name} VALUES (?,?,?,?)"
+        for idx, text in enumerate(text_list):
+            # Serialize metadata if present, else default to None
+            metadata = (
+                json.dumps(metadatas[idx])
+                if metadatas and idx < len(metadatas)
+                else None
+            )
+            self._connection.execute(
+                insert_sql,
+                [ids[idx], text, embeddings[idx], metadata],
+            )
+        return ids
+
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Performs a similarity search for a given query string.
+
+        Rows are ranked by DuckDB's `list_cosine_similarity` between the stored
+        vector column and the embedded query, highest similarity first.
+
+        Args:
+            query: The query string to search for.
+            k: The number of similar texts to return.
+
+        Returns:
+            A list of Documents most similar to the query.
+        """
+        embedding = self._embedding.embed_query(query)  # type: ignore
+        list_cosine_similarity = self.duckdb.FunctionExpression(
+            "list_cosine_similarity",
+            self.duckdb.ColumnExpression(self._vector_key),
+            self.duckdb.ConstantExpression(embedding),
+        )
+        # Rank on a temporary "similarity" column, then drop it together with
+        # the raw vectors so only id, text and metadata are fetched.
+        docs = (
+            self._table.select(
+                *[
+                    self.duckdb.StarExpression(exclude=[]),
+                    list_cosine_similarity.alias("similarity"),
+                ]
+            )
+            .order("similarity desc")
+            .limit(k)
+            .select(
+                self.duckdb.StarExpression(exclude=["similarity", self._vector_key])
+            )
+            .fetchdf()
+        )
+        return [
+            Document(
+                page_content=text,
+                # Metadata is stored as a JSON string; empty/NULL becomes {}.
+                metadata=json.loads(raw_metadata) if raw_metadata else {},
+            )
+            for text, raw_metadata in zip(docs[self._text_key], docs["metadata"])
+        ]
+
+    @classmethod
+    def from_texts(
+        cls: Type[VST],
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> DuckDB:
+        """Creates an instance of DuckDB and populates it with texts and
+          their embeddings.
+
+        Note that the `vector_key` and `table_name` defaults used here differ
+        from the defaults of the constructor.
+
+        Args:
+            texts: List of strings to add to the vector store.
+            embedding: The embedding function or model to use for generating embeddings.
+            metadatas: Optional list of metadata dictionaries associated with the texts.
+            **kwargs: Additional keyword arguments including:
+                - connection: DuckDB connection. If not provided, a new connection will
+                  be created.
+                - vector_key: The column name for storing vectors. Default "vector".
+                - id_key: The column name for storing unique identifiers. Default "id".
+                - text_key: The column name for storing text. Defaults to "text".
+                - table_name: The name of the table to use for storing embeddings.
+                    Defaults to "embeddings".
+                - ids: Optional list of ids to associate with the texts.
+
+        Returns:
+            An instance of DuckDB with the provided texts and their embeddings added.
+        """
+
+        # Create an instance of DuckDB from the constructor-related kwargs
+        instance = DuckDB(
+            connection=kwargs.get("connection", None),
+            embedding=embedding,
+            vector_key=kwargs.get("vector_key", "vector"),
+            id_key=kwargs.get("id_key", "id"),
+            text_key=kwargs.get("text_key", "text"),
+            table_name=kwargs.get("table_name", "embeddings"),
+        )
+        # Add texts and their embeddings to the DuckDB vector store. Only `ids`
+        # is forwarded: the remaining kwargs configure the instance, not the insert.
+        instance.add_texts(texts, metadatas=metadatas, ids=kwargs.get("ids"))
+
+        return instance
+
+    def _ensure_table(self) -> None:
+        """Ensures the table for storing embeddings exists.
+
+        The table has four columns: the id (primary key), the text, the vector
+        as `FLOAT[]`, and a `metadata` VARCHAR.
+        """
+        # NOTE: table and column names are interpolated into the DDL; they must
+        # come from a trusted source.
+        create_table_sql = f"""
+        CREATE TABLE IF NOT EXISTS {self._table_name} (
+            {self._id_key} VARCHAR PRIMARY KEY,
+            {self._text_key} VARCHAR,
+            {self._vector_key} FLOAT[],
+            metadata VARCHAR
+        )
+        """
+        self._connection.execute(create_table_sql)
--- a/libs/community/tests/integration_tests/vectorstores/test_duckdb.py
+++ b/libs/community/tests/integration_tests/vectorstores/test_duckdb.py
@ -0,0 +1,160 @@
+from typing import Dict, Iterator, List
+from uuid import uuid4
+
+import duckdb
+import pytest
+
+from langchain_community.vectorstores import DuckDB
+from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
+
+
+@pytest.fixture
+def duckdb_connection() -> Iterator[duckdb.DuckDBPyConnection]:
+    """Yield an in-memory DuckDB connection and close it after the test."""
+    # Setup a temporary DuckDB database
+    conn = duckdb.connect(":memory:")
+    yield conn
+    # Teardown: release the connection once the test has finished.
+    conn.close()
+
+
+@pytest.fixture
+def embeddings() -> FakeEmbeddings:
+    """Return the fake embedding model shared by the vector store tests."""
+    return FakeEmbeddings()
+
+
+@pytest.fixture
+def texts() -> List[str]:
+    """Return three short sample texts to store and search for."""
+    return ["text 1", "text 2", "item 3"]
+
+
+@pytest.fixture
+def metadatas() -> List[Dict[str, str]]:
+    """Return three metadata dicts, each carrying a `source` key."""
+    return [
+        {"source": "Document 1"},
+        {"source": "Document 2"},
+        {"source": "Document 3"},
+    ]
+
+
+@pytest.mark.requires("duckdb")
+def test_duckdb_with_connection(
+    duckdb_connection: duckdb.DuckDBPyConnection,
+    embeddings: FakeEmbeddings,
+    texts: List[str],
+) -> None:
+    """Texts added through a caller-supplied connection can be searched."""
+    vector_store = DuckDB(
+        embedding=embeddings,
+        table_name="test_table",
+        connection=duckdb_connection,
+    )
+    vector_store.add_texts(texts)
+
+    hits = vector_store.similarity_search("text 1")
+
+    assert "text 1" in [hit.page_content for hit in hits]
+
+
+@pytest.mark.requires("duckdb")
+def test_duckdb_without_connection(
+    embeddings: FakeEmbeddings, texts: List[str]
+) -> None:
+    """The store works when it has to create its own connection."""
+    vector_store = DuckDB(table_name="test_table", embedding=embeddings)
+    vector_store.add_texts(texts)
+
+    hits = vector_store.similarity_search("text 1")
+
+    assert "text 1" in [hit.page_content for hit in hits]
+
+
+@pytest.mark.requires("duckdb")
+def test_duckdb_add_texts(embeddings: FakeEmbeddings) -> None:
+    """A single added text is returned by a search for that same text."""
+    needle = "text 2"
+    vector_store = DuckDB(table_name="test_table", embedding=embeddings)
+    vector_store.add_texts([needle])
+
+    hits = vector_store.similarity_search(needle)
+
+    assert needle in [hit.page_content for hit in hits]
+
+
+@pytest.mark.requires("duckdb")
+def test_duckdb_add_texts_with_metadata(
+    duckdb_connection: duckdb.DuckDBPyConnection, embeddings: FakeEmbeddings
+) -> None:
+    """Metadata stored alongside texts is returned with the matching documents."""
+    texts = ["text with metadata 1", "text with metadata 2"]
+    metadatas = [
+        {"author": "Author 1", "date": "2021-01-01"},
+        {"author": "Author 2", "date": "2021-02-01"},
+    ]
+    store = DuckDB(
+        embedding=embeddings,
+        table_name="test_table_with_metadata",
+        connection=duckdb_connection,
+    )
+
+    # Store both texts together with their metadata, then fetch them back.
+    store.add_texts(texts, metadatas=metadatas)
+    result = store.similarity_search("text with metadata", k=2)
+
+    assert len(result) == 2, "Should return two results"
+
+    # The i-th hit is expected to carry the i-th metadata dict.
+    for position, expected in enumerate(metadatas):
+        number = position + 1
+        found = result[position].metadata
+        assert (
+            found.get("author") == expected["author"]
+        ), f"Metadata for Author {number} should be correctly retrieved"
+        assert (
+            found.get("date") == expected["date"]
+        ), f"Date for Author {number} should be correctly retrieved"
+
+
+@pytest.mark.requires("duckdb")
+def test_duckdb_add_texts_with_predefined_ids(
+    duckdb_connection: duckdb.DuckDBPyConnection, embeddings: FakeEmbeddings
+) -> None:
+    """Caller-supplied ids are used as-is and the texts remain searchable."""
+    store = DuckDB(
+        connection=duckdb_connection,
+        embedding=embeddings,
+        table_name="test_table_predefined_ids",
+    )
+    texts = ["unique text 1", "unique text 2"]
+    predefined_ids = [str(uuid4()), str(uuid4())]  # Generate unique IDs
+
+    # Add texts with the predefined IDs
+    returned_ids = store.add_texts(texts, ids=predefined_ids)
+
+    # The store must report back exactly the ids it was given, in order;
+    # without this check the test would pass even if the ids were ignored.
+    assert list(returned_ids) == predefined_ids, (
+        "add_texts should return the predefined ids unchanged."
+    )
+
+    # Perform a similarity search for each text and check if it's found
+    for text in texts:
+        result = store.similarity_search(text)
+
+        found_texts = [doc.page_content for doc in result]
+        assert (
+            text in found_texts
+        ), f"Text '{text}' was not found in the search results."
+
+
+@pytest.mark.requires("duckdb")
+def test_duckdb_from_texts(
+    duckdb_connection: duckdb.DuckDBPyConnection,
+    embeddings: FakeEmbeddings,
+    texts: List[str],
+    metadatas: List[Dict[str, str]],
+) -> None:
+    """`from_texts` builds a populated store whose hits carry their metadata."""
+    # Build the store in one step through the classmethod constructor.
+    vector_store = DuckDB.from_texts(
+        texts=texts,
+        embedding=embeddings,
+        metadatas=metadatas,
+        table_name="test_from_texts_table",
+        connection=duckdb_connection,
+    )
+
+    # Query the freshly populated store.
+    hits = vector_store.similarity_search("sample text", k=2)
+
+    # The store must have been populated and be able to return results.
+    assert len(hits) > 0, "Should return at least one result"
+
+    # Every returned document should still have its metadata attached.
+    for hit in hits:
+        assert "source" in hit.metadata, "Document metadata should include 'source' key"
--- a/libs/community/tests/unit_tests/vectorstores/test_public_api.py
+++ b/libs/community/tests/unit_tests/vectorstores/test_public_api.py
@ -28,6 +28,7 @@ _EXPECTED = [
    "DocArrayHnswSearch",
    "DocArrayInMemorySearch",
    "DocumentDBVectorSearch",
+    "DuckDB",
    "ElasticKnnSearch",
    "ElasticVectorSearch",
    "ElasticsearchStore",