community[patch]: Add vector index support to SingleStoreDB VectorStore (#17308)

This pull request adds support for various Approximate Nearest Neighbor (ANN) vector index algorithms to the SingleStoreDB VectorStore class. Vector indexes require SingleStoreDB version 8.5 or later. Enabling one can significantly speed up search, particularly when handling large sets of vectors. --------- Co-authored-by: Volodymyr Tkachuk <vtkachuk-ua@singlestore.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
8 months ago · e36bc379f2
parent 0bc4a9b3fc
commit e36bc379f2
3 changed files with 166 additions and 11 deletions
--- a/docs/docs/integrations/vectorstores/singlestoredb.ipynb
+++ b/docs/docs/integrations/vectorstores/singlestoredb.ipynb
@ -106,6 +106,14 @@
    "print(docs[0].page_content)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "035cba66",
+   "metadata": {},
+   "source": [
+    "Enhance your search efficiency with SingleStore DB version 8.5 or above by leveraging [ANN vector indexes](https://docs.singlestore.com/cloud/reference/sql-reference/vector-functions/vector-indexing/). By setting `use_vector_index=True` during vector store object creation, you can activate this feature. Additionally, if your vectors differ in dimensionality from the default OpenAI embedding size of 1536, be sure to set the `vector_size` parameter accordingly. "
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
--- a/libs/community/langchain_community/vectorstores/singlestoredb.py
+++ b/libs/community/langchain_community/vectorstores/singlestoredb.py
@ -57,6 +57,10 @@ class SingleStoreDB(VectorStore):
        content_field: str = "content",
        metadata_field: str = "metadata",
        vector_field: str = "vector",
+        use_vector_index: bool = False,
+        vector_index_name: str = "",
+        vector_index_options: Optional[dict] = None,
+        vector_size: int = 1536,
        pool_size: int = 5,
        max_overflow: int = 10,
        timeout: float = 30,
@ -88,6 +92,27 @@ class SingleStoreDB(VectorStore):
            vector_field (str, optional): Specifies the field to store the vector.
                Defaults to "vector".

+            use_vector_index (bool, optional): Toggles the use of a vector index.
+                Works only with SingleStoreDB 8.5 or later. Defaults to False.
+                If set to True, vector_size parameter is required to be set to
+                a proper value.
+
+            vector_index_name (str, optional): Specifies the name of the vector index.
+                Defaults to empty. Will be ignored if use_vector_index is set to False.
+
+            vector_index_options (dict, optional): Specifies the options for
+                the vector index. Defaults to {}.
+                Will be ignored if use_vector_index is set to False. The options are:
+                index_type (str, optional): Specifies the type of the index.
+                    Defaults to IVF_PQFS.
+                For more options, please refer to the SingleStoreDB documentation:
+                https://docs.singlestore.com/cloud/reference/sql-reference/vector-functions/vector-indexing/
+
+            vector_size (int, optional): Specifies the size of the vector.
+                Defaults to 1536. Required if use_vector_index is set to True.
+                Should be set to the same value as the size of the vectors
+                stored in the vector_field.
+
            Following arguments pertain to the connection pool:

            pool_size (int, optional): Determines the number of active connections in
@ -177,6 +202,19 @@ class SingleStoreDB(VectorStore):

                os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db'
                vectorstore = SingleStoreDB(OpenAIEmbeddings())
+
+            Using vector index:
+
+            .. code-block:: python
+
+                from langchain_community.embeddings import OpenAIEmbeddings
+                from langchain_community.vectorstores import SingleStoreDB
+
+                os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db'
+                vectorstore = SingleStoreDB(
+                    OpenAIEmbeddings(),
+                    use_vector_index=True,
+                )
        """

        self.embedding = embedding
@ -186,6 +224,12 @@ class SingleStoreDB(VectorStore):
        self.metadata_field = self._sanitize_input(metadata_field)
        self.vector_field = self._sanitize_input(vector_field)

+        self.use_vector_index = bool(use_vector_index)
+        self.vector_index_name = self._sanitize_input(vector_index_name)
+        self.vector_index_options = dict(vector_index_options or {})
+        self.vector_index_options["metric_type"] = self.distance_strategy
+        self.vector_size = int(vector_size)
+
        # Pass the rest of the kwargs to the connection.
        self.connection_kwargs = kwargs

@ -194,7 +238,7 @@ class SingleStoreDB(VectorStore):
            self.connection_kwargs["conn_attrs"] = dict()

        self.connection_kwargs["conn_attrs"]["_connector_name"] = "langchain python sdk"
-        self.connection_kwargs["conn_attrs"]["_connector_version"] = "1.0.1"
+        self.connection_kwargs["conn_attrs"]["_connector_version"] = "1.0.2"

        # Create connection pool.
        self.connection_pool = QueuePool(
@ -222,16 +266,38 @@ class SingleStoreDB(VectorStore):
        try:
            cur = conn.cursor()
            try:
-                cur.execute(
-                    """CREATE TABLE IF NOT EXISTS {}
-                    ({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
-                    {} BLOB, {} JSON);""".format(
-                        self.table_name,
-                        self.content_field,
-                        self.vector_field,
-                        self.metadata_field,
-                    ),
-                )
+                if self.use_vector_index:
+                    index_options = ""
+                    if self.vector_index_options and len(self.vector_index_options) > 0:
+                        index_options = "INDEX_OPTIONS '{}'".format(
+                            json.dumps(self.vector_index_options)
+                        )
+                    cur.execute(
+                        """CREATE TABLE IF NOT EXISTS {}
+                        ({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
+                        {} VECTOR({}, F32) NOT NULL, {} JSON,
+                        VECTOR INDEX {} ({}) {});""".format(
+                            self.table_name,
+                            self.content_field,
+                            self.vector_field,
+                            self.vector_size,
+                            self.metadata_field,
+                            self.vector_index_name,
+                            self.vector_field,
+                            index_options,
+                        ),
+                    )
+                else:
+                    cur.execute(
+                        """CREATE TABLE IF NOT EXISTS {}
+                        ({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
+                        {} BLOB, {} JSON);""".format(
+                            self.table_name,
+                            self.content_field,
+                            self.vector_field,
+                            self.metadata_field,
+                        ),
+                    )
            finally:
                cur.close()
        finally:
@ -279,6 +345,8 @@ class SingleStoreDB(VectorStore):
                            json.dumps(metadata),
                        ),
                    )
+                if self.use_vector_index:
+                    cur.execute("OPTIMIZE TABLE {} FLUSH;".format(self.table_name))
            finally:
                cur.close()
        finally:
@ -406,6 +474,10 @@ class SingleStoreDB(VectorStore):
        content_field: str = "content",
        metadata_field: str = "metadata",
        vector_field: str = "vector",
+        use_vector_index: bool = False,
+        vector_index_name: str = "",
+        vector_index_options: Optional[dict] = None,
+        vector_size: int = 1536,
        pool_size: int = 5,
        max_overflow: int = 10,
        timeout: float = 30,
@ -438,6 +510,10 @@ class SingleStoreDB(VectorStore):
            pool_size=pool_size,
            max_overflow=max_overflow,
            timeout=timeout,
+            use_vector_index=use_vector_index,
+            vector_index_name=vector_index_name,
+            vector_index_options=vector_index_options,
+            vector_size=vector_size,
            **kwargs,
        )
        instance.add_texts(texts, metadatas, embedding.embed_documents(texts), **kwargs)
--- a/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py
+++ b/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py
@ -4,6 +4,7 @@ from typing import List
 import numpy as np
 import pytest
 from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings

 from langchain_community.vectorstores.singlestoredb import SingleStoreDB
 from langchain_community.vectorstores.utils import DistanceStrategy
@ -43,6 +44,16 @@ class NormilizedFakeEmbeddings(FakeEmbeddings):
        return self.normalize(super().embed_query(text))


+class RandomEmbeddings(Embeddings):
+    """Test-only embeddings: a new random 100-dim vector on every call."""

+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        return [np.random.rand(100).tolist() for _ in texts]

+    def embed_query(self, text: str) -> List[float]:
+        return np.random.rand(100).tolist()
+
+
@pytest.fixture
 def texts() -> List[str]:
    return ["foo", "bar", "baz"]
@ -99,6 +110,66 @@ def test_singlestoredb_euclidean_distance(texts: List[str]) -> None:
    drop(table_name)


+@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
+def test_singlestoredb_vector_index_1(texts: List[str]) -> None:
+    """Test similarity search with a vector index and Euclidean distance."""
+    table_name = "test_singlestoredb_vector_index_1"
+    drop(table_name)
+    docsearch = SingleStoreDB.from_texts(
+        texts,
+        FakeEmbeddings(),
+        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
+        table_name=table_name,
+        use_vector_index=True,
+        vector_size=10,
+        host=TEST_SINGLESTOREDB_URL,
+    )
+    docsearch.add_texts(["foo"])
+    output = docsearch.similarity_search("foo", k=2)
+    assert output == TEST_RESULT
+    drop(table_name)
+
+
+@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
+def test_singlestoredb_vector_index_2(texts: List[str]) -> None:
+    """Test an IVF_PQ vector index: the nearest hit for "foo" is "foo"."""
+    table_name = "test_singlestoredb_vector_index_2"
+    drop(table_name)
+    docsearch = SingleStoreDB.from_texts(
+        texts,
+        FakeEmbeddings(),
+        table_name=table_name,
+        use_vector_index=True,
+        vector_index_options={"index_type": "IVF_PQ", "nlist": 256},
+        vector_size=10,
+        host=TEST_SINGLESTOREDB_URL,
+    )
+    docsearch.add_texts(["foo"])
+    output = docsearch.similarity_search("foo", k=1)
+    assert output[0].page_content == "foo"
+    drop(table_name)
+
+
+@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
+def test_singlestoredb_vector_index_large() -> None:
+    """Test vector index search over 300,000 random 100-dim vectors."""
+    table_name = "test_singlestoredb_vector_index_large"
+    drop(table_name)
+    docsearch = SingleStoreDB.from_texts(
+        ["foo"] * 300000,
+        RandomEmbeddings(),
+        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
+        table_name=table_name,
+        use_vector_index=True,
+        vector_size=100,
+        vector_index_name="vector_index_large",
+        host=TEST_SINGLESTOREDB_URL,
+    )
+    output = docsearch.similarity_search("foo", k=1)
+    assert output[0].page_content == "foo"
+    drop(table_name)
+
+
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
 def test_singlestoredb_from_existing(texts: List[str]) -> None:
    """Test adding a new document"""