langchain/tests/unit_tests/vectorstores/test_sklearn.py
Janos Tolgyesi 5f4552391f
Add SKLearnVectorStore (#5305)
# Add SKLearnVectorStore

This PR adds SKLearnVectorStore, a simple vector store based on the
NearestNeighbors implementation in the scikit-learn package. It provides
a drop-in vector store with minimal dependencies (scikit-learn is
typically already installed in a data scientist / ML engineer
environment). The vector store can be persisted to and loaded from JSON,
BSON and Parquet formats.
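
For illustration, a minimal usage sketch (the calls mirror the unit tests
below; `OpenAIEmbeddings` is used purely as an example and can be swapped
for any LangChain `Embeddings` implementation):

```python
from langchain.embeddings import OpenAIEmbeddings  # example only; any Embeddings works
from langchain.vectorstores import SKLearnVectorStore

texts = ["foo", "bar", "baz"]

# Build an in-memory index backed by scikit-learn's NearestNeighbors
store = SKLearnVectorStore.from_texts(texts, embedding=OpenAIEmbeddings())

# k-nearest-neighbour search over the embedded texts
docs = store.similarity_search("foo", k=1)
print(docs[0].page_content)
```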

SKLearnVectorStore has a soft (dynamic) dependency on the scikit-learn,
numpy and pandas packages. Persisting to BSON additionally requires the
bson package, and persisting to Parquet requires the pyarrow package.
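
A sketch of the persistence round trip (the `json` serializer is the one
exercised by the unit tests; `bson` and `parquet` are the other serializer
options described above and need the extra packages listed; the path and
embedding class are illustrative assumptions):

```python
from langchain.embeddings import OpenAIEmbeddings  # example only
from langchain.vectorstores import SKLearnVectorStore

store = SKLearnVectorStore.from_texts(
    ["foo", "bar", "baz"],
    embedding=OpenAIEmbeddings(),
    persist_path="/tmp/sklearn_store.json",  # illustrative path
    serializer="json",  # "bson" / "parquet" need the bson / pyarrow packages
)
store.persist()  # writes the embedded texts and metadata to persist_path

# Reload later by pointing a fresh instance at the same file
store = SKLearnVectorStore(
    embedding=OpenAIEmbeddings(),
    persist_path="/tmp/sklearn_store.json",
    serializer="json",
)
docs = store.similarity_search("foo", k=1)
```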

## Before submitting

Integration tests are provided under
`tests/integration_tests/vectorstores/test_sklearn.py`

A sample usage notebook is provided under
`docs/modules/indexes/vectorstores/examples/sklearn.ipynb`

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
2023-05-28 08:17:42 -07:00

"""Test SKLearnVectorStore functionality."""
from pathlib import Path

import pytest

from langchain.vectorstores import SKLearnVectorStore
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings


@pytest.mark.requires("numpy", "sklearn")
def test_sklearn() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    docsearch = SKLearnVectorStore.from_texts(texts, embedding=FakeEmbeddings())
    output = docsearch.similarity_search("foo", k=1)
    assert len(output) == 1
    assert output[0].page_content == "foo"


@pytest.mark.requires("numpy", "sklearn")
def test_sklearn_with_metadatas() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = SKLearnVectorStore.from_texts(
        texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output[0].metadata["page"] == "0"


@pytest.mark.requires("numpy", "sklearn")
def test_sklearn_with_metadatas_with_scores() -> None:
    """Test end to end construction and scored search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = SKLearnVectorStore.from_texts(
        texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
    )
    output = docsearch.similarity_search_with_relevance_scores("foo", k=1)
    assert len(output) == 1
    doc, score = output[0]
    assert doc.page_content == "foo"
    assert doc.metadata["page"] == "0"
    assert score == 1


@pytest.mark.requires("numpy", "sklearn")
def test_sklearn_with_persistence(tmpdir: Path) -> None:
    """Test end to end construction and search, with persistence."""
    persist_path = tmpdir / "foo.parquet"
    texts = ["foo", "bar", "baz"]
    docsearch = SKLearnVectorStore.from_texts(
        texts,
        FakeEmbeddings(),
        persist_path=str(persist_path),
        serializer="json",
    )
    output = docsearch.similarity_search("foo", k=1)
    assert len(output) == 1
    assert output[0].page_content == "foo"
    docsearch.persist()

    # Get a new VectorStore from the persisted directory
    docsearch = SKLearnVectorStore(
        embedding=FakeEmbeddings(), persist_path=str(persist_path), serializer="json"
    )
    output = docsearch.similarity_search("foo", k=1)
    assert len(output) == 1
    assert output[0].page_content == "foo"