mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
5f4552391f
# Add SKLearnVectorStore This PR adds SKLearnVectorStore, a simple vector store based on NearestNeighbors implementations in the scikit-learn package. This provides a simple drop-in vector store implementation with minimal dependencies (scikit-learn is typically installed in a data scientist / ML engineer environment). The vector store can be persisted to and loaded from JSON, BSON and Parquet formats. SKLearnVectorStore has a soft (dynamic) dependency on the scikit-learn, numpy and pandas packages. Persisting to BSON requires the bson package; persisting to Parquet requires the pyarrow package. ## Before submitting Integration tests are provided under `tests/integration_tests/vectorstores/test_sklearn.py` Sample usage notebook is provided under `docs/modules/indexes/vectorstores/examples/sklear.ipynb` Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
77 lines
2.5 KiB
Python
77 lines
2.5 KiB
Python
"""Test SKLearnVectorStore functionality."""
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from langchain.vectorstores import SKLearnVectorStore
|
|
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
|
|
|
|
|
@pytest.mark.requires("numpy", "sklearn")
def test_sklearn() -> None:
    """Test end to end construction and search."""
    corpus = ["foo", "bar", "baz"]
    store = SKLearnVectorStore.from_texts(corpus, embedding=FakeEmbeddings())

    # A top-1 search for an exact corpus member should return that member.
    hits = store.similarity_search("foo", k=1)
    assert len(hits) == 1
    assert hits[0].page_content == "foo"
|
|
|
|
|
|
@pytest.mark.requires("numpy", "sklearn")
def test_sklearn_with_metadatas() -> None:
    """Test end to end construction and search."""
    corpus = ["foo", "bar", "baz"]
    # Tag every text with its positional index so a hit can be traced back.
    page_tags = [{"page": str(idx)} for idx, _ in enumerate(corpus)]
    store = SKLearnVectorStore.from_texts(
        corpus,
        embedding=FakeEmbeddings(),
        metadatas=page_tags,
    )

    # "foo" is at index 0, so the top hit must carry the "0" page tag.
    top_hit = store.similarity_search("foo", k=1)[0]
    assert top_hit.metadata["page"] == "0"
|
|
|
|
|
|
@pytest.mark.requires("numpy", "sklearn")
def test_sklearn_with_metadatas_with_scores() -> None:
    """Test end to end construction and scored search."""
    corpus = ["foo", "bar", "baz"]
    page_tags = [{"page": str(idx)} for idx, _ in enumerate(corpus)]
    store = SKLearnVectorStore.from_texts(
        corpus,
        embedding=FakeEmbeddings(),
        metadatas=page_tags,
    )

    scored = store.similarity_search_with_relevance_scores("foo", k=1)
    assert len(scored) == 1

    top_doc, relevance = scored[0]
    assert top_doc.page_content == "foo"
    assert top_doc.metadata["page"] == "0"
    # An exact match should score a perfect relevance of 1.
    assert relevance == 1
|
|
|
|
|
|
@pytest.mark.requires("numpy", "sklearn")
def test_sklearn_with_persistence(tmp_path: Path) -> None:
    """Test end to end construction and search, with persistence.

    Uses pytest's ``tmp_path`` fixture (a real ``pathlib.Path``) instead of
    the legacy ``tmpdir`` fixture, which yields ``py.path.local`` and made
    the previous ``tmpdir: Path`` annotation incorrect.
    """
    # NOTE(review): the ".parquet" suffix is misleading given the "json"
    # serializer below; kept as-is because the suffix does not appear to
    # drive serializer selection here — confirm against SKLearnVectorStore.
    persist_path = tmp_path / "foo.parquet"
    texts = ["foo", "bar", "baz"]
    docsearch = SKLearnVectorStore.from_texts(
        texts,
        FakeEmbeddings(),
        persist_path=str(persist_path),
        serializer="json",
    )

    # Sanity-check search before persisting.
    output = docsearch.similarity_search("foo", k=1)
    assert len(output) == 1
    assert output[0].page_content == "foo"

    docsearch.persist()

    # Build a new vector store from the persisted file and verify the
    # same search still succeeds after a round trip.
    docsearch = SKLearnVectorStore(
        embedding=FakeEmbeddings(), persist_path=str(persist_path), serializer="json"
    )
    output = docsearch.similarity_search("foo", k=1)
    assert len(output) == 1
    assert output[0].page_content == "foo"
|