mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
d2e9b621ab
1. Introduced new distance strategies support: **DOT_PRODUCT** and **EUCLIDEAN_DISTANCE** for enhanced flexibility. 2. Implemented a feature to filter results based on metadata fields. 3. Incorporated connection attributes specifying "langchain python sdk" usage for enhanced traceability and debugging. 4. Expanded the suite of integration tests for improved code reliability. 5. Updated the existing notebook with the usage example @dev2049 --------- Co-authored-by: Volodymyr Tkachuk <vtkachuk-ua@singlestore.com> Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
351 lines
12 KiB
Python
351 lines
12 KiB
Python
"""Test SingleStoreDB functionality."""
|
|
from typing import List
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from langchain.docstore.document import Document
|
|
from langchain.vectorstores.singlestoredb import DistanceStrategy, SingleStoreDB
|
|
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
|
|
|
# Connection URL handed to ``s2.connect`` and to the ``host`` argument of
# ``SingleStoreDB`` throughout this module.
TEST_SINGLESTOREDB_URL = "root:pass@localhost:3306/db"

# Expected search results shared by the tests below.
TEST_SINGLE_RESULT = [Document(page_content="foo")]
TEST_SINGLE_WITH_METADATA_RESULT = [
    Document(page_content="foo", metadata={"a": "b"}),
]
TEST_RESULT = [Document(page_content="foo") for _ in range(2)]
|
|
|
|
# The SingleStoreDB client is optional: record whether it could be imported so
# the tests below can be skipped when it is missing.
try:
    import singlestoredb as s2
except ImportError:
    singlestoredb_installed = False
else:
    singlestoredb_installed = True
|
|
|
|
|
|
def drop(table_name: str) -> None:
    """Drop ``table_name`` from the test database if it exists.

    Args:
        table_name: Name of the table to remove; it is interpolated directly
            into the SQL statement.
    """
    statement = f"DROP TABLE IF EXISTS {table_name};"
    with s2.connect(TEST_SINGLESTOREDB_URL) as connection:
        connection.autocommit(True)
        with connection.cursor() as cur:
            cur.execute(statement)
|
|
|
|
|
|
class NormilizedFakeEmbeddings(FakeEmbeddings):
    """Fake embeddings with normalization. For testing purposes."""

    def normalize(self, vector: List[float]) -> List[float]:
        """Scale ``vector`` by its norm as computed by ``np.linalg.norm``.

        Args:
            vector: Components of the embedding to normalize.

        Returns:
            A new list with every component divided by the norm, converted to
            plain Python floats. A zero vector is not special-cased.
        """
        # The norm is the same for every component, so compute it once rather
        # than once per element.
        norm = np.linalg.norm(vector)
        return [float(v / norm) for v in vector]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return the parent class's document embeddings, each normalized."""
        return [self.normalize(v) for v in super().embed_documents(texts)]

    def embed_query(self, text: str) -> List[float]:
        """Return the parent class's query embedding, normalized."""
        return self.normalize(super().embed_query(text))
|
|
|
|
|
|
@pytest.fixture
def texts() -> List[str]:
    """Provide the three sample texts that the tests store and search."""
    sample_texts = ["foo", "bar", "baz"]
    return sample_texts
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb(texts: List[str]) -> None:
    """Test end to end construction and search."""
    table_name = "test_singlestoredb"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        docsearch = SingleStoreDB.from_texts(
            texts,
            NormilizedFakeEmbeddings(),
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        output = docsearch.similarity_search("foo", k=1)
        assert output == TEST_SINGLE_RESULT
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_new_vector(texts: List[str]) -> None:
    """Test adding a new document to a freshly built store."""
    table_name = "test_singlestoredb_new_vector"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        docsearch = SingleStoreDB.from_texts(
            texts,
            NormilizedFakeEmbeddings(),
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        # A second "foo" is added, so the top two hits should both be "foo".
        docsearch.add_texts(["foo"])
        output = docsearch.similarity_search("foo", k=2)
        assert output == TEST_RESULT
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_euclidean_distance(texts: List[str]) -> None:
    """Test adding a document and searching with the Euclidean distance strategy."""
    table_name = "test_singlestoredb_euclidean_distance"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        docsearch = SingleStoreDB.from_texts(
            texts,
            FakeEmbeddings(),
            distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        # A second "foo" is added, so the top two hits should both be "foo".
        docsearch.add_texts(["foo"])
        output = docsearch.similarity_search("foo", k=2)
        assert output == TEST_RESULT
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_from_existing(texts: List[str]) -> None:
    """Test searching through a store opened on an already populated table."""
    table_name = "test_singlestoredb_from_existing"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        # Populate the table through a first store instance.
        SingleStoreDB.from_texts(
            texts,
            NormilizedFakeEmbeddings(),
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        # Test creating from an existing table: reuse ``table_name`` rather
        # than repeating the literal so the two cannot drift apart.
        docsearch2 = SingleStoreDB(
            NormilizedFakeEmbeddings(),
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        output = docsearch2.similarity_search("foo", k=1)
        assert output == TEST_SINGLE_RESULT
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_from_documents(texts: List[str]) -> None:
    """Test from_documents constructor."""
    table_name = "test_singlestoredb_from_documents"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        docs = [Document(page_content=t, metadata={"a": "b"}) for t in texts]
        docsearch = SingleStoreDB.from_documents(
            docs,
            NormilizedFakeEmbeddings(),
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        # The metadata given at construction must come back with the hit.
        output = docsearch.similarity_search("foo", k=1)
        assert output == TEST_SINGLE_WITH_METADATA_RESULT
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_add_texts_to_existing(texts: List[str]) -> None:
    """Test adding a new document through a store opened on an existing table."""
    table_name = "test_singlestoredb_add_texts_to_existing"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        # Populate the table through a first store instance.
        SingleStoreDB.from_texts(
            texts,
            NormilizedFakeEmbeddings(),
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        # Test creating from an existing table, then adding to it.
        docsearch = SingleStoreDB(
            NormilizedFakeEmbeddings(),
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        docsearch.add_texts(["foo"])
        output = docsearch.similarity_search("foo", k=2)
        assert output == TEST_RESULT
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata(texts: List[str]) -> None:
    """Test filtering by metadata"""
    table_name = "test_singlestoredb_filter_metadata"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        docs = [
            Document(page_content=t, metadata={"index": i})
            for i, t in enumerate(texts)
        ]
        docsearch = SingleStoreDB.from_documents(
            docs,
            FakeEmbeddings(),
            distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        # The query text is "foo", but the filter selects the document at
        # index 2 ("baz").
        output = docsearch.similarity_search("foo", k=1, filter={"index": 2})
        assert output == [Document(page_content="baz", metadata={"index": 2})]
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_2(texts: List[str]) -> None:
    """Test filtering by a metadata field that is identical for every document"""
    table_name = "test_singlestoredb_filter_metadata_2"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        docs = [
            Document(page_content=t, metadata={"index": i, "category": "budget"})
            for i, t in enumerate(texts)
        ]
        docsearch = SingleStoreDB.from_documents(
            docs,
            FakeEmbeddings(),
            distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        # Every document matches the filter, so the expected hit is "foo",
        # the document whose text equals the query.
        output = docsearch.similarity_search(
            "foo", k=1, filter={"category": "budget"}
        )
        assert output == [
            Document(page_content="foo", metadata={"index": 0, "category": "budget"})
        ]
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_3(texts: List[str]) -> None:
    """Test filtering by two metadata fields"""
    table_name = "test_singlestoredb_filter_metadata_3"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        docs = [
            Document(page_content=t, metadata={"index": i, "category": "budget"})
            for i, t in enumerate(texts)
        ]
        docsearch = SingleStoreDB.from_documents(
            docs,
            FakeEmbeddings(),
            distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        # Only the document at index 1 ("bar") satisfies both filter fields.
        output = docsearch.similarity_search(
            "foo", k=1, filter={"category": "budget", "index": 1}
        )
        assert output == [
            Document(page_content="bar", metadata={"index": 1, "category": "budget"})
        ]
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_4(texts: List[str]) -> None:
    """Test no matches"""
    table_name = "test_singlestoredb_filter_metadata_4"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        docs = [
            Document(page_content=t, metadata={"index": i, "category": "budget"})
            for i, t in enumerate(texts)
        ]
        docsearch = SingleStoreDB.from_documents(
            docs,
            FakeEmbeddings(),
            distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        # No stored document has category "vacation", so nothing may be returned.
        output = docsearch.similarity_search(
            "foo", k=1, filter={"category": "vacation"}
        )
        assert output == []
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_5(texts: List[str]) -> None:
    """Test complex metadata path"""
    table_name = "test_singlestoredb_filter_metadata_5"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        docs = [
            Document(
                page_content=t,
                metadata={
                    "index": i,
                    "category": "budget",
                    "subfield": {"subfield": {"idx": i, "other_idx": i + 1}},
                },
            )
            for i, t in enumerate(texts)
        ]
        docsearch = SingleStoreDB.from_documents(
            docs,
            FakeEmbeddings(),
            distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        # The filter names only part of the nested metadata ("idx"); the
        # expected hit still carries the sibling key "other_idx".
        output = docsearch.similarity_search(
            "foo",
            k=1,
            filter={"category": "budget", "subfield": {"subfield": {"idx": 2}}},
        )
        assert output == [
            Document(
                page_content="baz",
                metadata={
                    "index": 2,
                    "category": "budget",
                    "subfield": {"subfield": {"idx": 2, "other_idx": 3}},
                },
            )
        ]
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_6(texts: List[str]) -> None:
    """Test filtering by a boolean metadata field"""
    table_name = "test_singlestoredb_filter_metadata_6"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        # Only the document at index 1 ("bar") gets ``is_good`` set to True.
        docs = [
            Document(
                page_content=t,
                metadata={"index": i, "category": "budget", "is_good": i == 1},
            )
            for i, t in enumerate(texts)
        ]
        docsearch = SingleStoreDB.from_documents(
            docs,
            FakeEmbeddings(),
            distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        output = docsearch.similarity_search(
            "foo", k=1, filter={"category": "budget", "is_good": True}
        )
        assert output == [
            Document(
                page_content="bar",
                metadata={"index": 1, "category": "budget", "is_good": True},
            )
        ]
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|
|
|
|
|
|
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_metadata_7(texts: List[str]) -> None:
    """Test filtering by float"""
    table_name = "test_singlestoredb_filter_metadata_7"
    # Remove any table left behind by an earlier run before starting.
    drop(table_name)
    try:
        # Scores are 0.5, 1.5 and 2.5 for "foo", "bar" and "baz" respectively.
        docs = [
            Document(
                page_content=t,
                metadata={"index": i, "category": "budget", "score": i + 0.5},
            )
            for i, t in enumerate(texts)
        ]
        docsearch = SingleStoreDB.from_documents(
            docs,
            FakeEmbeddings(),
            distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
            table_name=table_name,
            host=TEST_SINGLESTOREDB_URL,
        )
        # The query text is "bar", but only "baz" has the filtered score 2.5.
        output = docsearch.similarity_search(
            "bar", k=1, filter={"category": "budget", "score": 2.5}
        )
        assert output == [
            Document(
                page_content="baz",
                metadata={"index": 2, "category": "budget", "score": 2.5},
            )
        ]
    finally:
        # Drop the table even when construction or the assertion fails.
        drop(table_name)
|