community[patch]: Add vector index support to SingleStoreDB VectorStore (#17308)

This pull request introduces support for various Approximate Nearest
Neighbor (ANN) vector index algorithms in the VectorStore class,
starting from version 8.5 of SingleStore DB. Leveraging this enhancement
enables users to harness the power of vector indexing, significantly
boosting search speed, particularly when handling large sets of vectors.

---------

Co-authored-by: Volodymyr Tkachuk <vtkachuk-ua@singlestore.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
erick/exa-lint
volodymyr-memsql 8 months ago committed by GitHub
parent 0bc4a9b3fc
commit e36bc379f2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -106,6 +106,14 @@
"print(docs[0].page_content)"
]
},
{
"cell_type": "markdown",
"id": "035cba66",
"metadata": {},
"source": [
"Enhance your search efficiency with SingleStore DB version 8.5 or above by leveraging [ANN vector indexes](https://docs.singlestore.com/cloud/reference/sql-reference/vector-functions/vector-indexing/). By setting `use_vector_index=True` during vector store object creation, you can activate this feature. Additionally, if your vectors differ in dimensionality from the default OpenAI embedding size of 1536, ensure to specify the `vector_size` parameter accordingly. "
]
},
{
"cell_type": "code",
"execution_count": null,

@ -57,6 +57,10 @@ class SingleStoreDB(VectorStore):
content_field: str = "content",
metadata_field: str = "metadata",
vector_field: str = "vector",
use_vector_index: bool = False,
vector_index_name: str = "",
vector_index_options: Optional[dict] = None,
vector_size: int = 1536,
pool_size: int = 5,
max_overflow: int = 10,
timeout: float = 30,
@ -88,6 +92,27 @@ class SingleStoreDB(VectorStore):
vector_field (str, optional): Specifies the field to store the vector.
Defaults to "vector".
use_vector_index (bool, optional): Toggles the use of a vector index.
Works only with SingleStoreDB 8.5 or later. Defaults to False.
If set to True, vector_size parameter is required to be set to
a proper value.
vector_index_name (str, optional): Specifies the name of the vector index.
Defaults to empty. Will be ignored if use_vector_index is set to False.
vector_index_options (dict, optional): Specifies the options for
the vector index. Defaults to None, which is treated as {}.
Will be ignored if use_vector_index is set to False. The options are:
index_type (str, optional): Specifies the type of the index.
Defaults to IVF_PQFS.
For more options, please refer to the SingleStoreDB documentation:
https://docs.singlestore.com/cloud/reference/sql-reference/vector-functions/vector-indexing/
vector_size (int, optional): Specifies the size of the vector.
Defaults to 1536. Required if use_vector_index is set to True.
Should be set to the same value as the size of the vectors
stored in the vector_field.
Following arguments pertain to the connection pool:
pool_size (int, optional): Determines the number of active connections in
@ -177,6 +202,19 @@ class SingleStoreDB(VectorStore):
os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db'
vectorstore = SingleStoreDB(OpenAIEmbeddings())
Using vector index:
.. code-block:: python
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import SingleStoreDB
os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db'
vectorstore = SingleStoreDB(
OpenAIEmbeddings(),
use_vector_index=True,
)
"""
self.embedding = embedding
@ -186,6 +224,12 @@ class SingleStoreDB(VectorStore):
self.metadata_field = self._sanitize_input(metadata_field)
self.vector_field = self._sanitize_input(vector_field)
self.use_vector_index = bool(use_vector_index)
self.vector_index_name = self._sanitize_input(vector_index_name)
self.vector_index_options = dict(vector_index_options or {})
self.vector_index_options["metric_type"] = self.distance_strategy
self.vector_size = int(vector_size)
# Pass the rest of the kwargs to the connection.
self.connection_kwargs = kwargs
@ -194,7 +238,7 @@ class SingleStoreDB(VectorStore):
self.connection_kwargs["conn_attrs"] = dict()
self.connection_kwargs["conn_attrs"]["_connector_name"] = "langchain python sdk"
self.connection_kwargs["conn_attrs"]["_connector_version"] = "1.0.1"
self.connection_kwargs["conn_attrs"]["_connector_version"] = "1.0.2"
# Create connection pool.
self.connection_pool = QueuePool(
@ -222,16 +266,38 @@ class SingleStoreDB(VectorStore):
try:
cur = conn.cursor()
try:
cur.execute(
"""CREATE TABLE IF NOT EXISTS {}
({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
{} BLOB, {} JSON);""".format(
self.table_name,
self.content_field,
self.vector_field,
self.metadata_field,
),
)
if self.use_vector_index:
index_options = ""
if self.vector_index_options and len(self.vector_index_options) > 0:
index_options = "INDEX_OPTIONS '{}'".format(
json.dumps(self.vector_index_options)
)
cur.execute(
"""CREATE TABLE IF NOT EXISTS {}
({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
{} VECTOR({}, F32) NOT NULL, {} JSON,
VECTOR INDEX {} ({}) {});""".format(
self.table_name,
self.content_field,
self.vector_field,
self.vector_size,
self.metadata_field,
self.vector_index_name,
self.vector_field,
index_options,
),
)
else:
cur.execute(
"""CREATE TABLE IF NOT EXISTS {}
({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
{} BLOB, {} JSON);""".format(
self.table_name,
self.content_field,
self.vector_field,
self.metadata_field,
),
)
finally:
cur.close()
finally:
@ -279,6 +345,8 @@ class SingleStoreDB(VectorStore):
json.dumps(metadata),
),
)
if self.use_vector_index:
cur.execute("OPTIMIZE TABLE {} FLUSH;".format(self.table_name))
finally:
cur.close()
finally:
@ -406,6 +474,10 @@ class SingleStoreDB(VectorStore):
content_field: str = "content",
metadata_field: str = "metadata",
vector_field: str = "vector",
use_vector_index: bool = False,
vector_index_name: str = "",
vector_index_options: Optional[dict] = None,
vector_size: int = 1536,
pool_size: int = 5,
max_overflow: int = 10,
timeout: float = 30,
@ -438,6 +510,10 @@ class SingleStoreDB(VectorStore):
pool_size=pool_size,
max_overflow=max_overflow,
timeout=timeout,
use_vector_index=use_vector_index,
vector_index_name=vector_index_name,
vector_index_options=vector_index_options,
vector_size=vector_size,
**kwargs,
)
instance.add_texts(texts, metadatas, embedding.embed_documents(texts), **kwargs)

@ -4,6 +4,7 @@ from typing import List
import numpy as np
import pytest
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores.singlestoredb import SingleStoreDB
from langchain_community.vectorstores.utils import DistanceStrategy
@ -43,6 +44,16 @@ class NormilizedFakeEmbeddings(FakeEmbeddings):
return self.normalize(super().embed_query(text))
class RandomEmbeddings(Embeddings):
    """Embeddings stub that returns random vectors; intended for tests only."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one fresh 100-element random vector per input text."""
        vectors: List[List[float]] = []
        for _ in texts:
            # The text content is ignored; only the number of texts matters.
            vectors.append(np.random.rand(100).tolist())
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Return a fresh 100-element random vector; ``text`` is not used."""
        sample = np.random.rand(100)
        return sample.tolist()
@pytest.fixture
def texts() -> List[str]:
    """Provide the three sample strings that tests request via ``texts``."""
    return ["foo", "bar", "baz"]
@ -99,6 +110,66 @@ def test_singlestoredb_euclidean_distance(texts: List[str]) -> None:
drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_vector_index_1(texts: List[str]) -> None:
    """Check similarity search on a vector-indexed table (Euclidean distance)."""
    table_name = "test_singlestoredb_vector_index_1"
    # Call drop() before building the store, and again once the test is done.
    drop(table_name)
    store_kwargs = {
        "distance_strategy": DistanceStrategy.EUCLIDEAN_DISTANCE,
        "table_name": table_name,
        "use_vector_index": True,
        "vector_size": 10,
        "host": TEST_SINGLESTOREDB_URL,
    }
    store = SingleStoreDB.from_texts(texts, FakeEmbeddings(), **store_kwargs)
    # Insert a second "foo" so that a k=2 query has two matching rows.
    store.add_texts(["foo"])
    hits = store.similarity_search("foo", k=2)
    assert hits == TEST_RESULT
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_vector_index_2(texts: List[str]) -> None:
    """Test similarity search on a vector index built with custom options.

    The store is created with an explicit ``vector_index_options`` dict
    (``IVF_PQ`` index type, ``nlist`` of 256) and the default distance
    strategy, then queried for its single nearest neighbour.
    """
    table_name = "test_singlestoredb_vector_index_2"
    drop(table_name)
    docsearch = SingleStoreDB.from_texts(
        texts,
        FakeEmbeddings(),
        table_name=table_name,
        use_vector_index=True,
        vector_index_options={"index_type": "IVF_PQ", "nlist": 256},
        vector_size=10,
        host=TEST_SINGLESTOREDB_URL,
    )
    docsearch.add_texts(["foo"])
    output = docsearch.similarity_search("foo", k=1)
    # This has to be an assert: a bare comparison is evaluated and discarded,
    # so the test could never fail on a wrong search result.
    assert output[0].page_content == "foo"
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_vector_index_large() -> None:
    """Build a named vector index over 300,000 random vectors and query it."""
    table_name = "test_singlestoredb_vector_index_large"
    drop(table_name)
    # Every row carries the same text, so any returned row has content "foo".
    corpus = ["foo"] * 300000
    store = SingleStoreDB.from_texts(
        corpus,
        RandomEmbeddings(),
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
        table_name=table_name,
        use_vector_index=True,
        vector_size=100,
        vector_index_name="vector_index_large",
        host=TEST_SINGLESTOREDB_URL,
    )
    nearest = store.similarity_search("foo", k=1)
    assert nearest[0].page_content == "foo"
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_from_existing(texts: List[str]) -> None:
"""Test adding a new document"""

Loading…
Cancel
Save