langchain/tests/integration_tests/vectorstores/test_singlestoredb.py
Raymond Yuan 5171c3bcca
Refactor vector storage to correctly handle relevancy scores (#6570)
Description: This pull request aims to support generating the correct
generic relevancy scores for different vector stores by refactoring the
relevance score functions and their selection in the base class and
subclasses of VectorStore. This is especially relevant with VectorStores
that require a distance metric upon initialization. Note that many of the
current implementations of `_similarity_search_with_relevance_scores` are
not technically correct, as they just return
`self.similarity_search_with_score(query, k, **kwargs)` without applying
the relevant score function.

Also includes changes associated with:
https://github.com/hwchase17/langchain/pull/6564 and
https://github.com/hwchase17/langchain/pull/6494

See the more in-depth discussion in the thread on #6494.

Issue: 
https://github.com/hwchase17/langchain/issues/6526
https://github.com/hwchase17/langchain/issues/6481
https://github.com/hwchase17/langchain/issues/6346

Dependencies: None

The changes include:
- Properly handling score thresholding in FAISS
`similarity_search_with_score_by_vector` for the corresponding distance
metric.
- Refactoring the `_similarity_search_with_relevance_scores` method in
the base class and removing it from the subclasses for incorrectly
implemented subclasses.
- Adding a `_select_relevance_score_fn` method in the base class and
implementing it in the subclasses to select the appropriate relevance
score function based on the distance strategy.
- Updating the `__init__` methods of the subclasses to set the
`relevance_score_fn` attribute.
- Removing the `_default_relevance_score_fn` function from the FAISS
class and using the base class's `_euclidean_relevance_score_fn`
instead.
- Adding the `DistanceStrategy` enum to the `utils.py` file and updating
the imports in the vector store classes.
- Updating the tests to import the `DistanceStrategy` enum from the
`utils.py` file.

---------

Co-authored-by: Hanit <37485638+hanit-com@users.noreply.github.com>
2023-07-10 20:37:03 -07:00

352 lines
12 KiB
Python

"""Test SingleStoreDB functionality."""
from typing import List
import numpy as np
import pytest
from langchain.docstore.document import Document
from langchain.vectorstores.singlestoredb import SingleStoreDB
from langchain.vectorstores.utils import DistanceStrategy
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
# Connection string passed as ``host`` to every store and to ``s2.connect``.
TEST_SINGLESTOREDB_URL = "root:pass@localhost:3306/db"

# Expected search results reused by the assertions in this module.
TEST_SINGLE_RESULT = [Document(page_content="foo")]
TEST_SINGLE_WITH_METADATA_RESULT = [
    Document(page_content="foo", metadata={"a": "b"}),
]
# Two equal "foo" documents, for searches issued with k=2.
TEST_RESULT = [Document(page_content="foo") for _ in range(2)]
# The client library is optional: remember whether it could be imported so the
# tests can be skipped when it is missing.
try:
    import singlestoredb as s2
except ImportError:
    singlestoredb_installed = False
else:
    singlestoredb_installed = True
def drop(table_name: str) -> None:
    """Drop ``table_name`` from the test database if it exists.

    Args:
        table_name: Name of the table to remove. It is interpolated directly
            into the SQL statement, so only pass trusted names.
    """
    statement = f"DROP TABLE IF EXISTS {table_name};"
    with s2.connect(TEST_SINGLESTOREDB_URL) as connection:
        # Autocommit is switched on before the statement is executed.
        connection.autocommit(True)
        with connection.cursor() as cur:
            cur.execute(statement)
class NormilizedFakeEmbeddings(FakeEmbeddings):
    """Fake embeddings with normalization. For testing purposes."""

    def normalize(self, vector: List[float]) -> List[float]:
        """Divide every component of ``vector`` by the vector's norm.

        Args:
            vector: The embedding to scale.

        Returns:
            A new list of plain Python floats; ``vector`` is not modified.
            A zero-norm vector is not special-cased.
        """
        # Compute the norm once instead of once per component.
        norm = np.linalg.norm(vector)
        return [float(component / norm) for component in vector]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return the parent class's document embeddings, each normalized."""
        return [self.normalize(v) for v in super().embed_documents(texts)]

    def embed_query(self, text: str) -> List[float]:
        """Return the parent class's query embedding, normalized."""
        return self.normalize(super().embed_query(text))
@pytest.fixture
def texts() -> List[str]:
    """Provide the three sample texts used by the tests in this module."""
    sample_texts = ["foo", "bar", "baz"]
    return sample_texts
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb(texts: List[str]) -> None:
    """Build a store from texts and check the single best match for 'foo'."""
    table_name = "test_singlestoredb"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    store = SingleStoreDB.from_texts(
        texts,
        NormilizedFakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
    )
    results = store.similarity_search("foo", k=1)

    assert results == TEST_SINGLE_RESULT
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_new_vector(texts: List[str]) -> None:
    """Add a second 'foo' text to a store and expect both among the top two."""
    table_name = "test_singlestoredb_new_vector"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    store = SingleStoreDB.from_texts(
        texts,
        NormilizedFakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
    )
    # The fixture already contains "foo"; this inserts a duplicate.
    store.add_texts(["foo"])
    results = store.similarity_search("foo", k=2)

    assert results == TEST_RESULT
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_euclidean_distance(texts: List[str]) -> None:
    """Search a store configured with the Euclidean distance strategy."""
    table_name = "test_singlestoredb_euclidean_distance"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    store = SingleStoreDB.from_texts(
        texts,
        FakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    )
    # The fixture already contains "foo"; this inserts a duplicate.
    store.add_texts(["foo"])
    results = store.similarity_search("foo", k=2)

    assert results == TEST_RESULT
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_from_existing(texts: List[str]) -> None:
    """Search through a store object attached to an already populated table."""
    table_name = "test_singlestoredb_from_existing"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    # Populate the table; the returned store object is deliberately unused.
    SingleStoreDB.from_texts(
        texts,
        NormilizedFakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
    )

    # Open a second store on the same table via the plain constructor.
    existing_store = SingleStoreDB(
        NormilizedFakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
    )
    results = existing_store.similarity_search("foo", k=1)

    assert results == TEST_SINGLE_RESULT
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_from_documents(texts: List[str]) -> None:
    """Build a store with ``from_documents`` and check metadata is returned."""
    table_name = "test_singlestoredb_from_documents"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    # Every document carries the same metadata.
    documents = [Document(page_content=text, metadata={"a": "b"}) for text in texts]
    store = SingleStoreDB.from_documents(
        documents,
        NormilizedFakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
    )
    results = store.similarity_search("foo", k=1)

    assert results == TEST_SINGLE_WITH_METADATA_RESULT
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_add_texts_to_existing(texts: List[str]) -> None:
    """Add a text through a store attached to an already populated table."""
    table_name = "test_singlestoredb_add_texts_to_existing"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    # Populate the table; the returned store object is deliberately unused.
    SingleStoreDB.from_texts(
        texts,
        NormilizedFakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
    )

    # Open a second store on the same table and insert a duplicate "foo".
    existing_store = SingleStoreDB(
        NormilizedFakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
    )
    existing_store.add_texts(["foo"])
    results = existing_store.similarity_search("foo", k=2)

    assert results == TEST_RESULT
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata(texts: List[str]) -> None:
    """Filter a search by a single integer metadata field."""
    table_name = "test_singlestoredb_filter_metadata"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    # Tag each document with its position in the fixture list.
    documents = [
        Document(page_content=text, metadata={"index": position})
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        documents,
        FakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    )

    # The query text is "foo", but the filter only admits the "baz" document.
    metadata_filter = {"index": 2}
    results = store.similarity_search("foo", k=1, filter=metadata_filter)

    expected = [Document(page_content="baz", metadata={"index": 2})]
    assert results == expected
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_2(texts: List[str]) -> None:
    """Filter by a metadata field whose value is the same for every document."""
    table_name = "test_singlestoredb_filter_metadata_2"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    documents = [
        Document(page_content=text, metadata={"index": position, "category": "budget"})
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        documents,
        FakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    )

    # Every document matches the filter, so the query text decides the hit.
    metadata_filter = {"category": "budget"}
    results = store.similarity_search("foo", k=1, filter=metadata_filter)

    expected = [
        Document(page_content="foo", metadata={"index": 0, "category": "budget"})
    ]
    assert results == expected
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_3(texts: List[str]) -> None:
    """Filter a search by two metadata fields at once."""
    table_name = "test_singlestoredb_filter_metadata_3"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    documents = [
        Document(page_content=text, metadata={"index": position, "category": "budget"})
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        documents,
        FakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    )

    # Only the "bar" document has index 1.
    metadata_filter = {"category": "budget", "index": 1}
    results = store.similarity_search("foo", k=1, filter=metadata_filter)

    expected = [
        Document(page_content="bar", metadata={"index": 1, "category": "budget"})
    ]
    assert results == expected
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_4(texts: List[str]) -> None:
    """Expect an empty result when no document matches the filter."""
    table_name = "test_singlestoredb_filter_metadata_4"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    documents = [
        Document(page_content=text, metadata={"index": position, "category": "budget"})
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        documents,
        FakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    )

    # No stored document has the category "vacation".
    metadata_filter = {"category": "vacation"}
    results = store.similarity_search("foo", k=1, filter=metadata_filter)

    assert results == []
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_5(texts: List[str]) -> None:
    """Filter a search by a value nested two levels deep in the metadata."""
    table_name = "test_singlestoredb_filter_metadata_5"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    documents = [
        Document(
            page_content=text,
            metadata={
                "index": position,
                "category": "budget",
                "subfield": {
                    "subfield": {"idx": position, "other_idx": position + 1}
                },
            },
        )
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        documents,
        FakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    )

    # The nested filter names only "idx"; "other_idx" is left unspecified.
    metadata_filter = {
        "category": "budget",
        "subfield": {"subfield": {"idx": 2}},
    }
    results = store.similarity_search("foo", k=1, filter=metadata_filter)

    expected = [
        Document(
            page_content="baz",
            metadata={
                "index": 2,
                "category": "budget",
                "subfield": {"subfield": {"idx": 2, "other_idx": 3}},
            },
        )
    ]
    assert results == expected
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_6(texts: List[str]) -> None:
    """Filter a search by a boolean metadata field."""
    table_name = "test_singlestoredb_filter_metadata_6"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    # Only the document at position 1 ("bar") is flagged as good.
    documents = [
        Document(
            page_content=text,
            metadata={
                "index": position,
                "category": "budget",
                "is_good": position == 1,
            },
        )
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        documents,
        FakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    )

    metadata_filter = {"category": "budget", "is_good": True}
    results = store.similarity_search("foo", k=1, filter=metadata_filter)

    expected = [
        Document(
            page_content="bar",
            metadata={"index": 1, "category": "budget", "is_good": True},
        )
    ]
    assert results == expected
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_7(texts: List[str]) -> None:
    """Filter a search by a float metadata field."""
    table_name = "test_singlestoredb_filter_metadata_7"
    # Remove any pre-existing table of the same name before building the store.
    drop(table_name)

    # Scores are 0.5, 1.5 and 2.5 for the three fixture texts, in order.
    documents = [
        Document(
            page_content=text,
            metadata={
                "index": position,
                "category": "budget",
                "score": position + 0.5,
            },
        )
        for position, text in enumerate(texts)
    ]
    store = SingleStoreDB.from_documents(
        documents,
        FakeEmbeddings(),
        host=TEST_SINGLESTOREDB_URL,
        table_name=table_name,
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
    )

    # The query text is "bar", but only "baz" carries the score 2.5.
    metadata_filter = {"category": "budget", "score": 2.5}
    results = store.similarity_search("bar", k=1, filter=metadata_filter)

    expected = [
        Document(
            page_content="baz",
            metadata={"index": 2, "category": "budget", "score": 2.5},
        )
    ]
    assert results == expected
    drop(table_name)