community[minor]: Add `DuckDB` as a vectorstore (#18916)

DuckDB provides a cosine similarity function that operates on its list and
array data types, which allows it to be used as a vector store.
- **Description:** The latest version of DuckDB features a cosine
similarity function, which can be used with its support for list or
array column types. This PR surfaces this functionality to langchain.
    - **Dependencies:** duckdb 0.10.0
    - **Twitter handle:** @igocrite

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
pull/19470/head^2
Hugoberry 2 months ago committed by GitHub
parent fa6397d76a
commit 96dc180883
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -0,0 +1,108 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# DuckDB\n",
"This notebook shows how to use `DuckDB` as a vector store."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install duckdb"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We want to use OpenAIEmbeddings so we have to get the OpenAI API Key. "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.embeddings import OpenAIEmbeddings\n",
"from langchain.vectorstores import DuckDB"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import TextLoader\n",
"from langchain_text_splitters import CharacterTextSplitter\n",
"\n",
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
"documents = loader.load()\n",
"\n",
"documents = CharacterTextSplitter().split_documents(documents)\n",
"embeddings = OpenAIEmbeddings()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"docsearch = DuckDB.from_documents(documents, embeddings)\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(docs[0].page_content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -51,6 +51,7 @@ _module_lookup = {
"DocArrayHnswSearch": "langchain_community.vectorstores.docarray",
"DocArrayInMemorySearch": "langchain_community.vectorstores.docarray",
"DocumentDBVectorSearch": "langchain_community.vectorstores.documentdb",
"DuckDB": "langchain_community.vectorstores.duckdb",
"ElasticKnnSearch": "langchain_community.vectorstores.elastic_vector_search",
"ElasticVectorSearch": "langchain_community.vectorstores.elastic_vector_search",
"ElasticsearchStore": "langchain_community.vectorstores.elasticsearch",

@ -0,0 +1,263 @@
# mypy: disable-error-code=func-returns-value
from __future__ import annotations
import json
import uuid
from typing import Any, Iterable, List, Optional, Type
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VST, VectorStore
class DuckDB(VectorStore):
    """`DuckDB` vector store.

    This class provides a vector store interface for adding texts and performing
    similarity searches using DuckDB.

    For more information about DuckDB, see: https://duckdb.org/

    This integration requires the `duckdb` Python package.
    You can install it with `pip install duckdb`.
    `similarity_search` collects its results with `fetchdf()`, which returns a
    pandas DataFrame, so `pandas` must be installed as well.

    *Security Notice*: The default DuckDB configuration is not secure.

        By **default**, DuckDB can interact with files across the entire file system,
        which includes abilities to read, write, and list files and directories.
        It can also access some python variables present in the global namespace.

        When using this DuckDB vectorstore, we suggest that you initialize the
        DuckDB connection with a secure configuration.

        For example, you can set `enable_external_access` to `false` in the connection
        configuration to disable external access to the DuckDB connection.

        You can view the DuckDB configuration options here:

        https://duckdb.org/docs/configuration/overview.html

        Please review other relevant security considerations in the DuckDB
        documentation. (e.g., "autoinstall_known_extensions": "false",
        "autoload_known_extensions": "false")

        See https://python.langchain.com/docs/security for more information.

    Args:
        connection: Optional DuckDB connection. When omitted, an in-memory
            connection with `enable_external_access` disabled is created.
        embedding: The embedding function or model to use for generating embeddings.
        vector_key: The column name for storing vectors. Defaults to `embedding`.
        id_key: The column name for storing unique identifiers. Defaults to `id`.
        text_key: The column name for storing text. Defaults to `text`.
        table_name: The name of the table to use for storing embeddings. Defaults to
            `vectorstore`.

    Example:
        .. code-block:: python

            import duckdb
            conn = duckdb.connect(database=':memory:',
                config={
                    # Sample configuration to restrict some DuckDB capabilities
                    # List is not exhaustive. Please review DuckDB documentation.
                        "enable_external_access": "false",
                        "autoinstall_known_extensions": "false",
                        "autoload_known_extensions": "false"
                    }
            )
            embedding_function = ... # Define or import your embedding function here
            # All constructor arguments are keyword-only.
            vector_store = DuckDB(connection=conn, embedding=embedding_function)
            vector_store.add_texts(['text1', 'text2'])
            result = vector_store.similarity_search('text1')
    """

    def __init__(
        self,
        *,
        connection: Optional[Any] = None,
        embedding: Embeddings,
        vector_key: str = "embedding",
        id_key: str = "id",
        text_key: str = "text",
        table_name: str = "vectorstore",
    ):
        """Initialize with DuckDB connection and setup for vector storage.

        Raises:
            ImportError: If the `duckdb` package is not installed.
            ValueError: If `embedding` is None.
        """
        try:
            import duckdb
        except ImportError:
            raise ImportError(
                "Could not import duckdb package. "
                "Please install it with `pip install duckdb`."
            )
        # Keep a handle on the module: the expression API is used at query time.
        self.duckdb = duckdb
        self._embedding = embedding
        self._vector_key = vector_key
        self._id_key = id_key
        self._text_key = text_key
        self._table_name = table_name

        if self._embedding is None:
            raise ValueError("An embedding function or model must be provided.")

        if connection is None:
            import warnings

            # Adjacent literals are concatenated verbatim, so every sentence
            # needs its own trailing space.
            warnings.warn(
                "No DuckDB connection provided. A new connection will be created. "
                "This connection is running in memory and no data will be persisted. "
                "To persist data, specify `connection=duckdb.connect(...)` when using "
                "the API. Please review the documentation of the vectorstore for "
                "security recommendations on configuring the connection."
            )

        self._connection = connection or self.duckdb.connect(
            database=":memory:", config={"enable_external_access": "false"}
        )
        self._ensure_table()
        self._table = self._connection.table(self._table_name)

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Returns the embedding object used by the vector store."""
        return self._embedding

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed the texts and insert them, one row per text, into the table.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
                Each entry is stored as a JSON string; texts without a
                matching entry get a NULL metadata value.
            kwargs: Additional parameters including optional 'ids' to associate
                with the texts.

        Returns:
            List of ids of the added texts.
        """
        # Materialize once: `texts` may be a one-shot iterator, and it is needed
        # for id generation, for embedding and for the insert loop.
        texts = list(texts)

        # Use caller-supplied ids when given (and non-empty), else generate them.
        ids = kwargs.pop("ids", None) or [str(uuid.uuid4()) for _ in texts]

        embeddings = self._embedding.embed_documents(texts)
        insert_sql = f"INSERT INTO {self._table_name} VALUES (?,?,?,?)"
        for idx, text in enumerate(texts):
            embedding = embeddings[idx]
            # Serialize metadata if present, else default to None
            metadata = (
                json.dumps(metadatas[idx])
                if metadatas and idx < len(metadatas)
                else None
            )
            self._connection.execute(
                insert_sql,
                [ids[idx], text, embedding, metadata],
            )
        return ids

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Performs a similarity search for a given query string.

        Rows are ranked by `list_cosine_similarity` between the stored vector
        column and the embedded query, highest similarity first.

        Args:
            query: The query string to search for.
            k: The number of similar texts to return.

        Returns:
            A list of Documents most similar to the query.
        """
        embedding = self._embedding.embed_query(query)  # type: ignore
        list_cosine_similarity = self.duckdb.FunctionExpression(
            "list_cosine_similarity",
            self.duckdb.ColumnExpression(self._vector_key),
            self.duckdb.ConstantExpression(embedding),
        )
        docs = (
            self._table.select(
                *[
                    self.duckdb.StarExpression(exclude=[]),
                    list_cosine_similarity.alias("similarity"),
                ]
            )
            .order("similarity desc")
            .limit(k)
            # Drop the score and the raw vector before materializing the rows.
            .select(
                self.duckdb.StarExpression(exclude=["similarity", self._vector_key])
            )
            .fetchdf()
        )

        results: List[Document] = []
        for idx in range(len(docs)):
            raw_metadata = docs["metadata"][idx]
            results.append(
                Document(
                    page_content=docs[self._text_key][idx],
                    # Metadata is stored as a JSON string; empty/NULL -> {}.
                    metadata=json.loads(raw_metadata) if raw_metadata else {},
                )
            )
        return results

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> DuckDB:
        """Creates an instance of DuckDB and populates it with texts and
        their embeddings.

        Note that the `vector_key` and `table_name` defaults used here
        ("vector" and "embeddings") differ from the constructor defaults
        ("embedding" and "vectorstore"). Pass them explicitly when a table
        created through one entry point must be reopened through the other.

        Args:
            texts: List of strings to add to the vector store.
            embedding: The embedding function or model to use for generating embeddings.
            metadatas: Optional list of metadata dictionaries associated with the texts.
            **kwargs: Additional keyword arguments including:
                - connection: DuckDB connection. If not provided, a new connection will
                  be created.
                - vector_key: The column name for storing vectors. Default "vector".
                - id_key: The column name for storing unique identifiers. Default "id".
                - text_key: The column name for storing text. Defaults to "text".
                - table_name: The name of the table to use for storing embeddings.
                    Defaults to "embeddings".
                - ids: Optional ids for the texts (forwarded to `add_texts`).

        Returns:
            An instance of DuckDB with the provided texts and their embeddings added.
        """
        # Extract kwargs for DuckDB instance creation
        connection = kwargs.get("connection", None)
        vector_key = kwargs.get("vector_key", "vector")
        id_key = kwargs.get("id_key", "id")
        text_key = kwargs.get("text_key", "text")
        table_name = kwargs.get("table_name", "embeddings")

        # Create an instance of DuckDB
        instance = DuckDB(
            connection=connection,
            embedding=embedding,
            vector_key=vector_key,
            id_key=id_key,
            text_key=text_key,
            table_name=table_name,
        )
        # Add texts and their embeddings to the DuckDB vector store.
        # The full kwargs are forwarded; `add_texts` only consumes `ids`.
        instance.add_texts(texts, metadatas=metadatas, **kwargs)

        return instance

    def _ensure_table(self) -> None:
        """Ensures the table for storing embeddings exists."""
        # Column and table names are interpolated directly into the statement.
        create_table_sql = f"""
        CREATE TABLE IF NOT EXISTS {self._table_name} (
            {self._id_key} VARCHAR PRIMARY KEY,
            {self._text_key} VARCHAR,
            {self._vector_key} FLOAT[],
            metadata VARCHAR
        )
        """
        self._connection.execute(create_table_sql)

@ -0,0 +1,160 @@
from typing import Dict, Iterator, List
from uuid import uuid4
import duckdb
import pytest
from langchain_community.vectorstores import DuckDB
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
@pytest.fixture
def duckdb_connection() -> Iterator[duckdb.DuckDBPyConnection]:
    """Yield an in-memory DuckDB connection and close it after the test."""
    connection = duckdb.connect(":memory:")
    yield connection
    # Teardown: runs once the requesting test has finished.
    connection.close()
@pytest.fixture
def embeddings() -> FakeEmbeddings:
    """Provide a fresh `FakeEmbeddings` instance for each test."""
    fake = FakeEmbeddings()
    return fake
@pytest.fixture
def texts() -> List[str]:
    """Provide the three sample strings stored by the tests."""
    samples = ["text 1", "text 2", "item 3"]
    return samples
@pytest.fixture
def metadatas() -> List[Dict[str, str]]:
    """Provide one `source` metadata dict per sample text."""
    sources = ("Document 1", "Document 2", "Document 3")
    return [{"source": source} for source in sources]
@pytest.mark.requires("duckdb")
def test_duckdb_with_connection(
    duckdb_connection: duckdb.DuckDBPyConnection,
    embeddings: FakeEmbeddings,
    texts: List[str],
) -> None:
    """A store built on a caller-supplied connection finds an added text."""
    vector_store = DuckDB(
        embedding=embeddings, table_name="test_table", connection=duckdb_connection
    )
    vector_store.add_texts(texts)

    hits = vector_store.similarity_search("text 1")
    found = [hit.page_content for hit in hits]
    assert "text 1" in found
@pytest.mark.requires("duckdb")
def test_duckdb_without_connection(
    embeddings: FakeEmbeddings, texts: List[str]
) -> None:
    """A store that creates its own connection finds an added text."""
    vector_store = DuckDB(table_name="test_table", embedding=embeddings)
    vector_store.add_texts(texts)

    hits = vector_store.similarity_search("text 1")
    found = [hit.page_content for hit in hits]
    assert "text 1" in found
@pytest.mark.requires("duckdb")
def test_duckdb_add_texts(embeddings: FakeEmbeddings) -> None:
    """A single added text is returned when searched for."""
    needle = "text 2"
    vector_store = DuckDB(table_name="test_table", embedding=embeddings)
    vector_store.add_texts([needle])

    hits = vector_store.similarity_search(needle)
    found = [hit.page_content for hit in hits]
    assert "text 2" in found
@pytest.mark.requires("duckdb")
def test_duckdb_add_texts_with_metadata(
    duckdb_connection: duckdb.DuckDBPyConnection, embeddings: FakeEmbeddings
) -> None:
    """Metadata passed to `add_texts` comes back on the retrieved documents."""
    vector_store = DuckDB(
        table_name="test_table_with_metadata",
        embedding=embeddings,
        connection=duckdb_connection,
    )
    sample_metadatas = [
        {"author": "Author 1", "date": "2021-01-01"},
        {"author": "Author 2", "date": "2021-02-01"},
    ]
    sample_texts = ["text with metadata 1", "text with metadata 2"]

    # Store the texts together with their metadata.
    vector_store.add_texts(sample_texts, metadatas=sample_metadatas)

    # Retrieve both documents back.
    hits = vector_store.similarity_search("text with metadata", k=2)

    # The count is checked first, so indexing the hits below is safe.
    assert len(hits) == 2, "Should return two results"
    first_meta = hits[0].metadata
    assert (
        first_meta.get("author") == "Author 1"
    ), "Metadata for Author 1 should be correctly retrieved"
    assert (
        first_meta.get("date") == "2021-01-01"
    ), "Date for Author 1 should be correctly retrieved"
    second_meta = hits[1].metadata
    assert (
        second_meta.get("author") == "Author 2"
    ), "Metadata for Author 2 should be correctly retrieved"
    assert (
        second_meta.get("date") == "2021-02-01"
    ), "Date for Author 2 should be correctly retrieved"
@pytest.mark.requires("duckdb")
def test_duckdb_add_texts_with_predefined_ids(
    duckdb_connection: duckdb.DuckDBPyConnection, embeddings: FakeEmbeddings
) -> None:
    """Caller-supplied ids are returned, stored, and the texts stay searchable."""
    store = DuckDB(
        connection=duckdb_connection,
        embedding=embeddings,
        table_name="test_table_predefined_ids",
    )
    texts = ["unique text 1", "unique text 2"]
    predefined_ids = [str(uuid4()), str(uuid4())]  # Generate unique IDs

    # Add texts with the predefined IDs
    returned_ids = store.add_texts(texts, ids=predefined_ids)

    # The ids handed in must be the ids reported back ...
    assert (
        returned_ids == predefined_ids
    ), "add_texts should return the predefined ids unchanged."

    # ... and the ids actually written to the table (default id column: "id").
    stored_rows = duckdb_connection.execute(
        "SELECT id FROM test_table_predefined_ids"
    ).fetchall()
    assert {row[0] for row in stored_rows} == set(
        predefined_ids
    ), "Stored ids should match the predefined ids."

    # Perform a similarity search for each text and check if it's found
    for text in texts:
        result = store.similarity_search(text)
        found_texts = [doc.page_content for doc in result]
        assert (
            text in found_texts
        ), f"Text '{text}' was not found in the search results."
@pytest.mark.requires("duckdb")
def test_duckdb_from_texts(
    duckdb_connection: duckdb.DuckDBPyConnection,
    embeddings: FakeEmbeddings,
    texts: List[str],
    metadatas: List[Dict[str, str]],
) -> None:
    """`from_texts` builds a populated store whose results carry metadata."""
    # Build the store through the class-method constructor.
    vector_store = DuckDB.from_texts(
        table_name="test_from_texts_table",
        connection=duckdb_connection,
        texts=texts,
        embedding=embeddings,
        metadatas=metadatas,
    )

    # Query the freshly populated store.
    hits = vector_store.similarity_search("sample text", k=2)

    # The store must have been populated.
    assert len(hits) > 0, "Should return at least one result"

    # Every returned document must carry the metadata it was stored with.
    for hit in hits:
        assert "source" in hit.metadata, "Document metadata should include 'source' key"

@ -28,6 +28,7 @@ _EXPECTED = [
"DocArrayHnswSearch",
"DocArrayInMemorySearch",
"DocumentDBVectorSearch",
"DuckDB",
"ElasticKnnSearch",
"ElasticVectorSearch",
"ElasticsearchStore",

Loading…
Cancel
Save