mirror of https://github.com/hwchase17/langchain
community[minor]: Add `DuckDB` as a vectorstore (#18916)
DuckDB has a cosine similarity function along list and array data types, which can be used as a vector store. - **Description:** The latest version of DuckDB features a cosine similarity function, which can be used with its support for list or array column types. This PR surfaces this functionality to langchain. - **Dependencies:** duckdb 0.10.0 - **Twitter handle:** @igocrite --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>pull/19470/head^2
parent
fa6397d76a
commit
96dc180883
@ -0,0 +1,108 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# DuckDB\n",
|
||||
"This notebook shows how to use `DuckDB` as a vector store."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install duckdb"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We want to use `OpenAIEmbeddings`, so we need to provide an OpenAI API key."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings import OpenAIEmbeddings\n",
|
||||
"from langchain.vectorstores import DuckDB"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"from langchain_text_splitters import CharacterTextSplitter\n",
|
||||
"\n",
|
||||
"loader = TextLoader(\"../../modules/state_of_the_union.txt\")\n",
|
||||
"documents = loader.load()\n",
|
||||
"\n",
|
||||
"documents = CharacterTextSplitter().split_documents(documents)\n",
|
||||
"embeddings = OpenAIEmbeddings()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docsearch = DuckDB.from_documents(documents, embeddings)\n",
|
||||
"\n",
|
||||
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
|
||||
"docs = docsearch.similarity_search(query)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(docs[0].page_content)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,263 @@
|
||||
# mypy: disable-error-code=func-returns-value
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from typing import Any, Iterable, List, Optional, Type
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.vectorstores import VST, VectorStore
|
||||
|
||||
|
||||
class DuckDB(VectorStore):
    """`DuckDB` vector store.

    This class provides a vector store interface for adding texts and performing
    similarity searches using DuckDB.

    For more information about DuckDB, see: https://duckdb.org/

    This integration requires the `duckdb` Python package.
    You can install it with `pip install duckdb`.

    *Security Notice*: The default DuckDB configuration is not secure.

        By **default**, DuckDB can interact with files across the entire file system,
        which includes abilities to read, write, and list files and directories.
        It can also access some python variables present in the global namespace.

        When using this DuckDB vectorstore, we suggest that you initialize the
        DuckDB connection with a secure configuration.

        For example, you can set `enable_external_access` to `false` in the connection
        configuration to disable external access to the DuckDB connection.

        You can view the DuckDB configuration options here:

        https://duckdb.org/docs/configuration/overview.html

        Please review other relevant security considerations in the DuckDB
        documentation. (e.g., "autoinstall_known_extensions": "false",
        "autoload_known_extensions": "false")

        See https://python.langchain.com/docs/security for more information.

    Args:
        connection: Optional DuckDB connection. When omitted, a new in-memory
            connection is created (nothing is persisted).
        embedding: The embedding function or model to use for generating embeddings.
        vector_key: The column name for storing vectors. Defaults to `embedding`.
        id_key: The column name for storing unique identifiers. Defaults to `id`.
        text_key: The column name for storing text. Defaults to `text`.
        table_name: The name of the table to use for storing embeddings. Defaults to
            `vectorstore`.

    Note:
        All constructor arguments are keyword-only. `from_texts` uses different
        defaults for `vector_key` and `table_name` than the constructor; pass them
        explicitly if the same table is accessed through both entry points.

    Example:
        .. code-block:: python

            import duckdb
            conn = duckdb.connect(database=':memory:',
                config={
                    # Sample configuration to restrict some DuckDB capabilities
                    # List is not exhaustive. Please review DuckDB documentation.
                        "enable_external_access": "false",
                        "autoinstall_known_extensions": "false",
                        "autoload_known_extensions": "false"
                    }
            )
            embedding_function = ... # Define or import your embedding function here
            vector_store = DuckDB(connection=conn, embedding=embedding_function)
            vector_store.add_texts(['text1', 'text2'])
            result = vector_store.similarity_search('text1')
    """

    def __init__(
        self,
        *,
        connection: Optional[Any] = None,
        embedding: Embeddings,
        vector_key: str = "embedding",
        id_key: str = "id",
        text_key: str = "text",
        table_name: str = "vectorstore",
    ):
        """Initialize with DuckDB connection and setup for vector storage.

        Creates the backing table if it does not exist yet.

        Raises:
            ImportError: If the `duckdb` package is not installed.
            ValueError: If `embedding` is None.
        """
        try:
            import duckdb
        except ImportError as err:
            raise ImportError(
                "Could not import duckdb package. "
                "Please install it with `pip install duckdb`."
            ) from err
        self.duckdb = duckdb
        self._embedding = embedding
        self._vector_key = vector_key
        self._id_key = id_key
        self._text_key = text_key
        self._table_name = table_name

        if self._embedding is None:
            raise ValueError("An embedding function or model must be provided.")

        if connection is None:
            import warnings

            # Each fragment ends with a space so the sentences do not run together.
            warnings.warn(
                "No DuckDB connection provided. A new connection will be created. "
                "This connection is running in memory and no data will be persisted. "
                "To persist data, specify `connection=duckdb.connect(...)` when using "
                "the API. Please review the documentation of the vectorstore for "
                "security recommendations on configuring the connection.",
                stacklevel=2,
            )
            connection = self.duckdb.connect(
                database=":memory:", config={"enable_external_access": "false"}
            )

        self._connection = connection
        self._ensure_table()
        self._table = self._connection.table(self._table_name)

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Returns the embedding object used by the vector store."""
        return self._embedding

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed texts and insert them into the table, one row per text.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
                Each entry is stored as a JSON string; texts without a
                corresponding entry get a NULL metadata value.
            kwargs: Additional parameters including optional 'ids' to associate
                with the texts. Random UUIDs are generated when no ids are given.

        Returns:
            List of ids of the added texts.

        Raises:
            ValueError: If fewer ids than texts are supplied.
        """
        # Materialize once: `texts` may be a one-shot iterator and is needed for
        # id generation, embedding and insertion.
        texts = list(texts)
        if not texts:
            return []

        ids = kwargs.pop("ids", None) or [str(uuid.uuid4()) for _ in texts]
        if len(ids) < len(texts):
            # Fail before inserting anything rather than part-way through.
            raise ValueError(
                f"Got {len(ids)} ids for {len(texts)} texts; "
                "provide one id per text."
            )

        embeddings = self._embedding.embed_documents(texts)
        for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
            # Serialize metadata if present, else default to None
            metadata = (
                json.dumps(metadatas[idx])
                if metadatas and idx < len(metadatas)
                else None
            )
            # Values are bound as parameters; column order matches _ensure_table.
            self._connection.execute(
                f"INSERT INTO {self._table_name} VALUES (?,?,?,?)",
                [ids[idx], text, embedding, metadata],
            )
        return ids

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Performs a similarity search for a given query string.

        Rows are ranked by `list_cosine_similarity` between the stored vector
        and the query embedding, highest first. Results are materialized with
        `fetchdf()`, which returns a pandas DataFrame.

        Args:
            query: The query string to search for.
            k: The number of similar texts to return.

        Returns:
            A list of Documents most similar to the query.
        """
        embedding = self._embedding.embed_query(query)  # type: ignore
        list_cosine_similarity = self.duckdb.FunctionExpression(
            "list_cosine_similarity",
            self.duckdb.ColumnExpression(self._vector_key),
            self.duckdb.ConstantExpression(embedding),
        )
        rows = (
            self._table.select(
                *[
                    self.duckdb.StarExpression(exclude=[]),
                    list_cosine_similarity.alias("similarity"),
                ]
            )
            .order("similarity desc")
            .limit(k)
            # Drop the score and the raw vector; only text/metadata are returned.
            .select(
                self.duckdb.StarExpression(exclude=["similarity", self._vector_key])
            )
            .fetchdf()
        )

        documents = []
        for text, raw_metadata in zip(rows[self._text_key], rows["metadata"]):
            # A NULL metadata cell may surface as None or NaN; only a non-empty
            # string is valid JSON input.
            if isinstance(raw_metadata, str) and raw_metadata:
                metadata = json.loads(raw_metadata)
            else:
                metadata = {}
            documents.append(Document(page_content=text, metadata=metadata))
        return documents

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> DuckDB:
        """Creates an instance of DuckDB and populates it with texts and
        their embeddings.

        Args:
            texts: List of strings to add to the vector store.
            embedding: The embedding function or model to use for generating embeddings.
            metadatas: Optional list of metadata dictionaries associated with the texts.
            **kwargs: Additional keyword arguments including:
                - connection: DuckDB connection. If not provided, a new connection will
                be created.
                - vector_key: The column name for storing vectors. Default "vector".
                - id_key: The column name for storing unique identifiers. Default "id".
                - text_key: The column name for storing text. Defaults to "text".
                - table_name: The name of the table to use for storing embeddings.
                    Defaults to "embeddings".
                - ids: Optional ids for the texts, forwarded to `add_texts`.

        Returns:
            An instance of DuckDB with the provided texts and their embeddings added.
        """

        # Extract kwargs for DuckDB instance creation.
        # NOTE: the vector_key/table_name defaults here intentionally remain
        # "vector"/"embeddings" (they differ from the constructor defaults) so
        # tables created by earlier calls keep working.
        connection = kwargs.get("connection", None)
        vector_key = kwargs.get("vector_key", "vector")
        id_key = kwargs.get("id_key", "id")
        text_key = kwargs.get("text_key", "text")
        table_name = kwargs.get("table_name", "embeddings")

        # Create an instance of DuckDB
        instance = DuckDB(
            connection=connection,
            embedding=embedding,
            vector_key=vector_key,
            id_key=id_key,
            text_key=text_key,
            table_name=table_name,
        )
        # Add texts and their embeddings to the DuckDB vector store
        instance.add_texts(texts, metadatas=metadatas, **kwargs)

        return instance

    def _ensure_table(self) -> None:
        """Ensures the table for storing embeddings exists.

        The table has four columns in this order: id (primary key), text,
        vector (`FLOAT[]`) and `metadata` (JSON stored as VARCHAR).
        """
        # NOTE(review): table and column names are interpolated into the SQL
        # unquoted, so they must come from trusted configuration.
        create_table_sql = f"""
        CREATE TABLE IF NOT EXISTS {self._table_name} (
            {self._id_key} VARCHAR PRIMARY KEY,
            {self._text_key} VARCHAR,
            {self._vector_key} FLOAT[],
            metadata VARCHAR
        )
        """
        self._connection.execute(create_table_sql)
|
@ -0,0 +1,160 @@
|
||||
from typing import Dict, Iterator, List
|
||||
from uuid import uuid4
|
||||
|
||||
import duckdb
|
||||
import pytest
|
||||
|
||||
from langchain_community.vectorstores import DuckDB
|
||||
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||||
|
||||
|
||||
@pytest.fixture
def duckdb_connection() -> Iterator[duckdb.DuckDBPyConnection]:
    """Yield an in-memory DuckDB connection and close it afterwards."""
    connection = duckdb.connect(":memory:")
    yield connection
    # Teardown: release the connection after the yield resumes.
    connection.close()
|
||||
|
||||
|
||||
@pytest.fixture
def embeddings() -> FakeEmbeddings:
    """Provide a `FakeEmbeddings` instance to use as the embedding model."""
    fake_model = FakeEmbeddings()
    return fake_model
|
||||
|
||||
|
||||
@pytest.fixture
def texts() -> List[str]:
    """Provide three short sample texts to store."""
    sample_texts = ["text 1", "text 2", "item 3"]
    return sample_texts
|
||||
|
||||
|
||||
@pytest.fixture
def metadatas() -> List[Dict[str, str]]:
    """Provide three metadata dicts, each with a `source` key."""
    return [{"source": f"Document {number}"} for number in (1, 2, 3)]
|
||||
|
||||
|
||||
@pytest.mark.requires("duckdb")
def test_duckdb_with_connection(
    duckdb_connection: duckdb.DuckDBPyConnection,
    embeddings: FakeEmbeddings,
    texts: List[str],
) -> None:
    """Texts added through an explicit connection are found by a search."""
    vector_store = DuckDB(
        connection=duckdb_connection,
        embedding=embeddings,
        table_name="test_table",
    )
    vector_store.add_texts(texts)

    hits = vector_store.similarity_search("text 1")

    assert "text 1" in [hit.page_content for hit in hits]
|
||||
|
||||
|
||||
@pytest.mark.requires("duckdb")
def test_duckdb_without_connection(
    embeddings: FakeEmbeddings, texts: List[str]
) -> None:
    """A store built without a connection still stores and finds texts."""
    vector_store = DuckDB(embedding=embeddings, table_name="test_table")
    vector_store.add_texts(texts)

    hits = vector_store.similarity_search("text 1")

    assert "text 1" in [hit.page_content for hit in hits]
|
||||
|
||||
|
||||
@pytest.mark.requires("duckdb")
def test_duckdb_add_texts(embeddings: FakeEmbeddings) -> None:
    """A single added text is returned when searching for it."""
    vector_store = DuckDB(embedding=embeddings, table_name="test_table")
    vector_store.add_texts(["text 2"])

    hits = vector_store.similarity_search("text 2")

    assert "text 2" in [hit.page_content for hit in hits]
|
||||
|
||||
|
||||
@pytest.mark.requires("duckdb")
def test_duckdb_add_texts_with_metadata(
    duckdb_connection: duckdb.DuckDBPyConnection, embeddings: FakeEmbeddings
) -> None:
    """Metadata stored with each text is returned on the matching Document."""
    vector_store = DuckDB(
        connection=duckdb_connection,
        embedding=embeddings,
        table_name="test_table_with_metadata",
    )
    texts = ["text with metadata 1", "text with metadata 2"]
    metadatas = [
        {"author": "Author 1", "date": "2021-01-01"},
        {"author": "Author 2", "date": "2021-02-01"},
    ]

    # Store both texts together with their metadata, then query for them.
    vector_store.add_texts(texts, metadatas=metadatas)
    hits = vector_store.similarity_search("text with metadata", k=2)

    assert len(hits) == 2, "Should return two results"

    # Results are expected in insertion order: check author, then date, per hit.
    for hit, expected in zip(hits, metadatas):
        author = expected["author"]
        assert (
            hit.metadata.get("author") == author
        ), f"Metadata for {author} should be correctly retrieved"
        assert (
            hit.metadata.get("date") == expected["date"]
        ), f"Date for {author} should be correctly retrieved"
|
||||
|
||||
|
||||
@pytest.mark.requires("duckdb")
def test_duckdb_add_texts_with_predefined_ids(
    duckdb_connection: duckdb.DuckDBPyConnection, embeddings: FakeEmbeddings
) -> None:
    """Texts inserted with caller-supplied ids can be found by searching."""
    vector_store = DuckDB(
        connection=duckdb_connection,
        embedding=embeddings,
        table_name="test_table_predefined_ids",
    )
    texts = ["unique text 1", "unique text 2"]
    # One freshly generated UUID string per text.
    predefined_ids = [str(uuid4()) for _ in texts]

    vector_store.add_texts(texts, ids=predefined_ids)

    # Every stored text must show up when it is used as the query.
    for text in texts:
        found_texts = [
            hit.page_content for hit in vector_store.similarity_search(text)
        ]
        assert (
            text in found_texts
        ), f"Text '{text}' was not found in the search results."
|
||||
|
||||
|
||||
@pytest.mark.requires("duckdb")
def test_duckdb_from_texts(
    duckdb_connection: duckdb.DuckDBPyConnection,
    embeddings: FakeEmbeddings,
    texts: List[str],
    metadatas: List[Dict[str, str]],
) -> None:
    """`from_texts` builds a populated store whose hits carry metadata."""
    vector_store = DuckDB.from_texts(
        texts=texts,
        embedding=embeddings,
        metadatas=metadatas,
        connection=duckdb_connection,
        table_name="test_from_texts_table",
    )

    hits = vector_store.similarity_search("sample text", k=2)

    # The store must have been populated by the class method.
    assert len(hits) > 0, "Should return at least one result"

    # Each returned document should carry the metadata supplied at creation.
    for hit in hits:
        assert "source" in hit.metadata, "Document metadata should include 'source' key"
|
Loading…
Reference in New Issue