Update SingleStoreDB vectorstore (#6423)

1. Introduced new distance strategies support: **DOT_PRODUCT** and
**EUCLIDEAN_DISTANCE** for enhanced flexibility.
2. Implemented a feature to filter results based on metadata fields.
3. Incorporated connection attributes specifying "langchain python sdk"
usage for enhanced traceability and debugging.
4. Expanded the suite of integration tests for improved code
reliability.
5. Updated the existing notebook with the usage example

@dev2049

---------

Co-authored-by: Volodymyr Tkachuk <vtkachuk-ua@singlestore.com>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
volodymyr-memsql 2023-06-20 08:08:58 +03:00 committed by GitHub
parent 6efd5fa2b9
commit d2e9b621ab
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 812 additions and 46 deletions

View File

@ -5,9 +5,8 @@
"id": "2b9582dc", "id": "2b9582dc",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# SingleStoreDB vector search\n", "# SingleStoreDB\n",
"[SingleStore DB](https://singlestore.com) is a high-performance distributed database that supports deployment both in the [cloud](https://www.singlestore.com/cloud/) and on-premises. For a significant duration, it has provided support for vector functions such as [dot_product](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/dot_product.html), thereby positioning itself as an ideal solution for AI applications that require text similarity matching. \n", "[SingleStoreDB](https://singlestore.com/) is a high-performance distributed SQL database that supports deployment both in the [cloud](https://www.singlestore.com/cloud/) and on-premises. It provides vector storage, and vector functions including [dot_product](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/dot_product.html) and [euclidean_distance](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/euclidean_distance.html), thereby supporting AI applications that require text similarity matching. This tutorial illustrates how to [work with vector data in SingleStoreDB](https://docs.singlestore.com/managed-service/en/developer-resources/functional-extensions/working-with-vector-data.html)."
"This tutorial illustrates how to utilize the features of the SingleStore DB Vector Store."
] ]
}, },
{ {
@ -58,10 +57,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Load text samples\n", "# Load text samples \n",
"from langchain.document_loaders import TextLoader\n", "loader = TextLoader('../../../state_of_the_union.txt')\n",
"\n",
"loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
"documents = loader.load()\n", "documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n", "docs = text_splitter.split_documents(documents)\n",
@ -91,7 +88,7 @@
"docsearch = SingleStoreDB.from_documents(\n", "docsearch = SingleStoreDB.from_documents(\n",
" docs,\n", " docs,\n",
" embeddings,\n", " embeddings,\n",
" table_name=\"noteook\", # use table with a custom name\n", " table_name = \"notebook\", # use table with a custom name \n",
")" ")"
] ]
}, },

View File

@ -1,6 +1,7 @@
"""Wrapper around SingleStore DB.""" """Wrapper around SingleStore DB."""
from __future__ import annotations from __future__ import annotations
import enum
import json import json
from typing import ( from typing import (
Any, Any,
@ -20,6 +21,19 @@ from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore, VectorStoreRetriever from langchain.vectorstores.base import VectorStore, VectorStoreRetriever
class DistanceStrategy(str, enum.Enum):
    """Vector comparison functions available for similarity search.

    Each member's value is the name of the SQL function that the vector
    store interpolates into its search query with ``str.format``.

    Members:
        EUCLIDEAN_DISTANCE: Rank by Euclidean distance between vectors.
        DOT_PRODUCT: Rank by the scalar (dot) product of the vectors.
    """

    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
    DOT_PRODUCT = "DOT_PRODUCT"

    def __str__(self) -> str:
        """Return the bare member value (e.g. ``"DOT_PRODUCT"``)."""
        # Since Python 3.11, format() and f-strings on a mixed-in enum use
        # Enum.__str__, which renders "DistanceStrategy.DOT_PRODUCT". That
        # text is not a valid SQL function name, so pin the rendering to the
        # value to keep it identical on every Python version.
        return str(self.value)
# Strategy used when a caller does not pass ``distance_strategy``.
DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.DOT_PRODUCT
# Sort direction placed after ``ORDER BY __score`` in the search query.
# Euclidean distance keeps ORDER BY's default direction (empty directive), so
# the smallest distances come first; dot product uses DESC so the largest
# products come first.
ORDERING_DIRECTIVE: dict = {
DistanceStrategy.EUCLIDEAN_DISTANCE: "",
DistanceStrategy.DOT_PRODUCT: "DESC",
}
class SingleStoreDB(VectorStore): class SingleStoreDB(VectorStore):
""" """
This class serves as a Pythonic interface to the SingleStore DB database. This class serves as a Pythonic interface to the SingleStore DB database.
@ -45,6 +59,7 @@ class SingleStoreDB(VectorStore):
self, self,
embedding: Embeddings, embedding: Embeddings,
*, *,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
table_name: str = "embeddings", table_name: str = "embeddings",
content_field: str = "content", content_field: str = "content",
metadata_field: str = "metadata", metadata_field: str = "metadata",
@ -59,6 +74,18 @@ class SingleStoreDB(VectorStore):
Args: Args:
embedding (Embeddings): A text embedding model. embedding (Embeddings): A text embedding model.
distance_strategy (DistanceStrategy, optional):
Determines the strategy employed for calculating
the distance between vectors in the embedding space.
Defaults to DOT_PRODUCT.
Available options are:
- DOT_PRODUCT: Computes the scalar product of two vectors.
This is the default behavior
- EUCLIDEAN_DISTANCE: Computes the Euclidean distance between
two vectors. This metric considers the geometric distance in
the vector space, and might be more suitable for embeddings
that rely on spatial relationships.
table_name (str, optional): Specifies the name of the table in use. table_name (str, optional): Specifies the name of the table in use.
Defaults to "embeddings". Defaults to "embeddings".
content_field (str, optional): Specifies the field to store the content. content_field (str, optional): Specifies the field to store the content.
@ -137,6 +164,7 @@ class SingleStoreDB(VectorStore):
vectorstore = SingleStoreDB( vectorstore = SingleStoreDB(
OpenAIEmbeddings(), OpenAIEmbeddings(),
distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
host="127.0.0.1", host="127.0.0.1",
port=3306, port=3306,
user="user", user="user",
@ -159,6 +187,7 @@ class SingleStoreDB(VectorStore):
""" """
self.embedding = embedding self.embedding = embedding
self.distance_strategy = distance_strategy
self.table_name = table_name self.table_name = table_name
self.content_field = content_field self.content_field = content_field
self.metadata_field = metadata_field self.metadata_field = metadata_field
@ -167,6 +196,17 @@ class SingleStoreDB(VectorStore):
"""Pass the rest of the kwargs to the connection.""" """Pass the rest of the kwargs to the connection."""
self.connection_kwargs = kwargs self.connection_kwargs = kwargs
"""Add program name and version to connection attributes."""
if "conn_attrs" not in self.connection_kwargs:
self.connection_kwargs["conn_attrs"] = dict()
if "program_name" not in self.connection_kwargs["conn_attrs"]:
self.connection_kwargs["conn_attrs"][
"program_name"
] = "langchain python sdk"
self.connection_kwargs["conn_attrs"][
"program_version"
] = "0.0.205" # the version of SingleStoreDB VectorStore implementation
"""Create connection pool.""" """Create connection pool."""
self.connection_pool = QueuePool( self.connection_pool = QueuePool(
self._get_connection, self._get_connection,
@ -246,7 +286,7 @@ class SingleStoreDB(VectorStore):
return [] return []
def similarity_search( def similarity_search(
self, query: str, k: int = 4, **kwargs: Any self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
) -> List[Document]: ) -> List[Document]:
"""Returns the most similar indexed documents to the query text. """Returns the most similar indexed documents to the query text.
@ -255,21 +295,38 @@ class SingleStoreDB(VectorStore):
Args: Args:
query (str): The query text for which to find similar documents. query (str): The query text for which to find similar documents.
k (int): The number of documents to return. Default is 4. k (int): The number of documents to return. Default is 4.
filter (dict): A dictionary of metadata fields and values to filter by.
Returns: Returns:
List[Document]: A list of documents that are most similar to the query text. List[Document]: A list of documents that are most similar to the query text.
Examples:
.. code-block:: python
from langchain.vectorstores import SingleStoreDB
from langchain.embeddings import OpenAIEmbeddings
s2 = SingleStoreDB.from_documents(
docs,
OpenAIEmbeddings(),
host="username:password@localhost:3306/database"
)
s2.similarity_search("query text", 1,
{"metadata_field": "metadata_value"})
""" """
docs_and_scores = self.similarity_search_with_score(query, k=k) docs_and_scores = self.similarity_search_with_score(
query=query, k=k, filter=filter
)
return [doc for doc, _ in docs_and_scores] return [doc for doc, _ in docs_and_scores]
def similarity_search_with_score( def similarity_search_with_score(
self, query: str, k: int = 4 self, query: str, k: int = 4, filter: Optional[dict] = None
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""Return docs most similar to query. Uses cosine similarity. """Return docs most similar to query. Uses cosine similarity.
Args: Args:
query: Text to look up documents similar to. query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4. k: Number of Documents to return. Defaults to 4.
filter: A dictionary of metadata fields and values to filter by.
Defaults to None.
Returns: Returns:
List of Documents most similar to the query and score for each List of Documents most similar to the query and score for each
@ -278,21 +335,52 @@ class SingleStoreDB(VectorStore):
embedding = self.embedding.embed_query(query) embedding = self.embedding.embed_query(query)
conn = self.connection_pool.connect() conn = self.connection_pool.connect()
result = [] result = []
where_clause: str = ""
where_clause_values: List[Any] = []
if filter:
where_clause = "WHERE "
arguments = []
def build_where_clause(
    where_clause_values: List[Any],
    sub_filter: dict,
    prefix_args: Optional[List[str]] = None,
) -> None:
    """Flatten a (possibly nested) metadata filter into SQL conditions.

    Walks ``sub_filter`` recursively. For every non-dict value it appends
    one ``JSON_EXTRACT_JSON(<metadata field>, %s, ...) = %s`` condition to
    the enclosing ``arguments`` list, and appends the matching query
    parameters (the key path, then the JSON-encoded value) to
    ``where_clause_values``.

    Args:
        where_clause_values: Query parameter list; extended in place.
        sub_filter: The filter dictionary (or nested sub-dictionary) to
            process.
        prefix_args: Keys of the enclosing dictionaries, outermost first.
            ``None`` (the default) means ``sub_filter`` is the top level.
    """
    # A None sentinel replaces the former ``[]`` default, which was a single
    # list object shared by every call.
    path_prefix = [] if prefix_args is None else prefix_args
    for key, value in sub_filter.items():
        key_path = path_prefix + [key]
        if isinstance(value, dict):
            # Nested dictionary: descend with the extended key path.
            build_where_clause(where_clause_values, value, key_path)
            continue
        # One "%s" placeholder per key on the path to this value.
        arguments.append(
            "JSON_EXTRACT_JSON({}, {}) = %s".format(
                self.metadata_field,
                ", ".join(["%s"] * len(key_path)),
            )
        )
        where_clause_values.extend(key_path)
        # The stored value is compared in its JSON-encoded form.
        where_clause_values.append(json.dumps(value))
build_where_clause(where_clause_values, filter)
where_clause += " AND ".join(arguments)
try: try:
cur = conn.cursor() cur = conn.cursor()
try: try:
cur.execute( cur.execute(
"""SELECT {}, {}, DOT_PRODUCT({}, JSON_ARRAY_PACK(%s)) as __score """SELECT {}, {}, {}({}, JSON_ARRAY_PACK(%s)) as __score
FROM {} ORDER BY __score DESC LIMIT %s""".format( FROM {} {} ORDER BY __score {} LIMIT %s""".format(
self.content_field, self.content_field,
self.metadata_field, self.metadata_field,
self.distance_strategy,
self.vector_field, self.vector_field,
self.table_name, self.table_name,
where_clause,
ORDERING_DIRECTIVE[self.distance_strategy],
), ),
( ("[{}]".format(",".join(map(str, embedding))),)
"[{}]".format(",".join(map(str, embedding))), + tuple(where_clause_values)
k, + (k,),
),
) )
for row in cur.fetchall(): for row in cur.fetchall():
@ -310,6 +398,7 @@ class SingleStoreDB(VectorStore):
texts: List[str], texts: List[str],
embedding: Embeddings, embedding: Embeddings,
metadatas: Optional[List[dict]] = None, metadatas: Optional[List[dict]] = None,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
table_name: str = "embeddings", table_name: str = "embeddings",
content_field: str = "content", content_field: str = "content",
metadata_field: str = "metadata", metadata_field: str = "metadata",
@ -338,6 +427,7 @@ class SingleStoreDB(VectorStore):
instance = cls( instance = cls(
embedding, embedding,
distance_strategy=distance_strategy,
table_name=table_name, table_name=table_name,
content_field=content_field, content_field=content_field,
metadata_field=metadata_field, metadata_field=metadata_field,

525
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -101,7 +101,7 @@ azure-cognitiveservices-speech = {version = "^1.28.0", optional = true}
py-trello = {version = "^0.19.0", optional = true} py-trello = {version = "^0.19.0", optional = true}
momento = {version = "^1.5.0", optional = true} momento = {version = "^1.5.0", optional = true}
bibtexparser = {version = "^1.4.0", optional = true} bibtexparser = {version = "^1.4.0", optional = true}
singlestoredb = {version = "^0.6.1", optional = true} singlestoredb = {version = "^0.7.1", optional = true}
pyspark = {version = "^3.4.0", optional = true} pyspark = {version = "^3.4.0", optional = true}
tigrisdb = {version = "^1.0.0b6", optional = true} tigrisdb = {version = "^1.0.0b6", optional = true}
nebula3-python = {version = "^3.4.0", optional = true} nebula3-python = {version = "^3.4.0", optional = true}

View File

@ -5,7 +5,7 @@ import numpy as np
import pytest import pytest
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.vectorstores.singlestoredb import SingleStoreDB from langchain.vectorstores.singlestoredb import DistanceStrategy, SingleStoreDB
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
TEST_SINGLESTOREDB_URL = "root:pass@localhost:3306/db" TEST_SINGLESTOREDB_URL = "root:pass@localhost:3306/db"
@ -80,6 +80,24 @@ def test_singlestoredb_new_vector(texts: List[str]) -> None:
drop(table_name) drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_euclidean_distance(texts: List[str]) -> None:
    """Search a store configured with the Euclidean distance strategy.

    Builds the store from ``texts``, adds one extra "foo" text, and expects
    the two nearest results for "foo" to equal ``TEST_RESULT``.
    """
    table = "test_singlestoredb_euclidean_distance"
    # drop() is defined elsewhere in this module; presumably it clears any
    # table left over from an earlier run.
    drop(table)
    store = SingleStoreDB.from_texts(
        texts,
        FakeEmbeddings(),
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
        table_name=table,
        host=TEST_SINGLESTOREDB_URL,
    )
    store.add_texts(["foo"])
    nearest = store.similarity_search("foo", k=2)
    assert nearest == TEST_RESULT
    drop(table)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") @pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_from_existing(texts: List[str]) -> None: def test_singlestoredb_from_existing(texts: List[str]) -> None:
"""Test adding a new document""" """Test adding a new document"""
@ -140,3 +158,193 @@ def test_singlestoredb_add_texts_to_existing(texts: List[str]) -> None:
output = docsearch.similarity_search("foo", k=2) output = docsearch.similarity_search("foo", k=2)
assert output == TEST_RESULT assert output == TEST_RESULT
drop(table_name) drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata(texts: List[str]) -> None:
    """Filter on a metadata field whose value is unique per document.

    Each document is tagged with its position in ``texts``; filtering on
    ``index == 2`` must return only the document stored at that position.
    """
    table = "test_singlestoredb_filter_metadata"
    drop(table)
    indexed_docs = [
        Document(page_content=text, metadata={"index": position})
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        indexed_docs,
        FakeEmbeddings(),
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
        table_name=table,
        host=TEST_SINGLESTOREDB_URL,
    )
    expected = [Document(page_content="baz", metadata={"index": 2})]
    found = store.similarity_search("foo", k=1, filter={"index": 2})
    assert found == expected
    drop(table)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_2(texts: List[str]) -> None:
    """Filter on a metadata field that has the same value in every document.

    All documents carry ``category == "budget"``, so the filter excludes
    nothing and the single result for "foo" is the "foo" document.
    """
    table = "test_singlestoredb_filter_metadata_2"
    drop(table)
    budget_docs = [
        Document(page_content=text, metadata={"index": position, "category": "budget"})
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        budget_docs,
        FakeEmbeddings(),
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
        table_name=table,
        host=TEST_SINGLESTOREDB_URL,
    )
    expected = [
        Document(page_content="foo", metadata={"index": 0, "category": "budget"})
    ]
    found = store.similarity_search("foo", k=1, filter={"category": "budget"})
    assert found == expected
    drop(table)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_3(texts: List[str]) -> None:
    """Filter on two metadata fields at once.

    Both conditions must hold: the shared ``category`` and a specific
    ``index``. Only the document at position 1 satisfies them.
    """
    table = "test_singlestoredb_filter_metadata_3"
    drop(table)
    budget_docs = [
        Document(page_content=text, metadata={"index": position, "category": "budget"})
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        budget_docs,
        FakeEmbeddings(),
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
        table_name=table,
        host=TEST_SINGLESTOREDB_URL,
    )
    two_field_filter = {"category": "budget", "index": 1}
    expected = [
        Document(page_content="bar", metadata={"index": 1, "category": "budget"})
    ]
    found = store.similarity_search("foo", k=1, filter=two_field_filter)
    assert found == expected
    drop(table)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_4(texts: List[str]) -> None:
    """Filter with a value that no document carries.

    Every document has ``category == "budget"``, so filtering on
    ``"vacation"`` must produce an empty result list.
    """
    table = "test_singlestoredb_filter_metadata_4"
    drop(table)
    budget_docs = [
        Document(page_content=text, metadata={"index": position, "category": "budget"})
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        budget_docs,
        FakeEmbeddings(),
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
        table_name=table,
        host=TEST_SINGLESTOREDB_URL,
    )
    found = store.similarity_search("foo", k=1, filter={"category": "vacation"})
    assert found == []
    drop(table)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_5(texts: List[str]) -> None:
    """Filter on a value nested two levels deep in the metadata.

    The filter combines a top-level field with the path
    ``subfield -> subfield -> idx``; only the document at position 2 matches.
    """
    table = "test_singlestoredb_filter_metadata_5"
    drop(table)
    nested_docs = [
        Document(
            page_content=text,
            metadata={
                "index": position,
                "category": "budget",
                "subfield": {
                    "subfield": {"idx": position, "other_idx": position + 1}
                },
            },
        )
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        nested_docs,
        FakeEmbeddings(),
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
        table_name=table,
        host=TEST_SINGLESTOREDB_URL,
    )
    nested_filter = {"category": "budget", "subfield": {"subfield": {"idx": 2}}}
    expected = [
        Document(
            page_content="baz",
            metadata={
                "index": 2,
                "category": "budget",
                "subfield": {"subfield": {"idx": 2, "other_idx": 3}},
            },
        )
    ]
    found = store.similarity_search("foo", k=1, filter=nested_filter)
    assert found == expected
    drop(table)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_6(texts: List[str]) -> None:
    """Filter on a boolean metadata value.

    ``is_good`` is ``True`` only for the document at position 1, so that is
    the only document the filter may return.
    """
    table = "test_singlestoredb_filter_metadata_6"
    drop(table)
    flagged_docs = [
        Document(
            page_content=text,
            metadata={
                "index": position,
                "category": "budget",
                "is_good": position == 1,
            },
        )
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        flagged_docs,
        FakeEmbeddings(),
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
        table_name=table,
        host=TEST_SINGLESTOREDB_URL,
    )
    bool_filter = {"category": "budget", "is_good": True}
    expected = [
        Document(
            page_content="bar",
            metadata={"index": 1, "category": "budget", "is_good": True},
        )
    ]
    found = store.similarity_search("foo", k=1, filter=bool_filter)
    assert found == expected
    drop(table)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_7(texts: List[str]) -> None:
    """Filter on a float metadata value.

    Each document gets ``score == position + 0.5``; filtering on ``2.5``
    must return the document at position 2 even though the query is "bar".
    """
    table = "test_singlestoredb_filter_metadata_7"
    drop(table)
    scored_docs = [
        Document(
            page_content=text,
            metadata={
                "index": position,
                "category": "budget",
                "score": position + 0.5,
            },
        )
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        scored_docs,
        FakeEmbeddings(),
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
        table_name=table,
        host=TEST_SINGLESTOREDB_URL,
    )
    float_filter = {"category": "budget", "score": 2.5}
    expected = [
        Document(
            page_content="baz",
            metadata={"index": 2, "category": "budget", "score": 2.5},
        )
    ]
    found = store.similarity_search("bar", k=1, filter=float_filter)
    assert found == expected
    drop(table)