# langchain/tests/unit_tests/vectorstores/test_sklearn.py

"""Test SKLearnVectorStore functionality."""
from pathlib import Path

import pytest

from langchain.vectorstores import SKLearnVectorStore
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings


@pytest.mark.requires("numpy", "sklearn")
def test_sklearn() -> None:
    """Build a store from three texts and check a k=1 search for "foo"."""
    corpus = ["foo", "bar", "baz"]
    store = SKLearnVectorStore.from_texts(corpus, FakeEmbeddings())
    hits = store.similarity_search("foo", k=1)
    # Exactly one hit was requested, and it must be the query text itself.
    assert len(hits) == 1
    top_hit = hits[0]
    assert top_hit.page_content == "foo"


@pytest.mark.requires("numpy", "sklearn")
def test_sklearn_with_metadatas() -> None:
    """Build a store with per-text metadata and check it comes back on search."""
    corpus = ["foo", "bar", "baz"]
    # Tag each text with its position, stored as a string under "page".
    page_tags = [{"page": str(position)} for position, _ in enumerate(corpus)]
    store = SKLearnVectorStore.from_texts(
        corpus,
        FakeEmbeddings(),
        metadatas=page_tags,
    )
    hits = store.similarity_search("foo", k=1)
    top_hit = hits[0]
    # "foo" is the first text, so its metadata carries page "0".
    assert top_hit.metadata["page"] == "0"


@pytest.mark.requires("numpy", "sklearn")
def test_sklearn_with_metadatas_with_scores() -> None:
    """Test end to end construction and scored search.

    Builds a store from three texts with per-text ``page`` metadata, runs a
    relevance-scored search for ``"foo"`` with ``k=1`` and checks the returned
    document, its metadata and its score.
    """
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = SKLearnVectorStore.from_texts(
        texts,
        FakeEmbeddings(),
        metadatas=metadatas,
    )
    output = docsearch.similarity_search_with_relevance_scores("foo", k=1)
    assert len(output) == 1
    doc, score = output[0]
    assert doc.page_content == "foo"
    assert doc.metadata["page"] == "0"
    # The score is presumably a float; compare with a tolerance so the test
    # does not depend on exact floating-point rounding in the store.
    assert score == pytest.approx(1)


@pytest.mark.requires("numpy", "sklearn")
def test_sklearn_with_persistence(tmp_path: Path) -> None:
    """Test end to end construction and search, with persistence.

    Args:
        tmp_path: pytest's built-in per-test temporary directory, provided as
            a ``pathlib.Path`` (unlike the legacy ``tmpdir`` fixture, which
            yields a ``py.path.local``), so the annotation is accurate.
    """
    # NOTE(review): the file is named ".parquet" although the "json"
    # serializer is requested — presumably harmless; confirm.
    persist_path = tmp_path / "foo.parquet"
    texts = ["foo", "bar", "baz"]
    docsearch = SKLearnVectorStore.from_texts(
        texts,
        FakeEmbeddings(),
        persist_path=str(persist_path),
        serializer="json",
    )

    # The freshly built store must answer the query before anything is saved.
    output = docsearch.similarity_search("foo", k=1)
    assert len(output) == 1
    assert output[0].page_content == "foo"

    docsearch.persist()

    # Get a new VectorStore from the persisted file
    docsearch = SKLearnVectorStore(
        FakeEmbeddings(), persist_path=str(persist_path), serializer="json"
    )
    # The reloaded store must give the same answer as the original one.
    output = docsearch.similarity_search("foo", k=1)
    assert len(output) == 1
    assert output[0].page_content == "foo"


@pytest.mark.requires("numpy", "sklearn")
def test_sklearn_mmr() -> None:
    """Build a store and check a max-marginal-relevance search for "foo"."""
    corpus = ["foo", "bar", "baz"]
    store = SKLearnVectorStore.from_texts(corpus, FakeEmbeddings())
    # Fetch all three candidates but keep only the single best selection.
    hits = store.max_marginal_relevance_search("foo", k=1, fetch_k=3)
    assert len(hits) == 1
    top_hit = hits[0]
    assert top_hit.page_content == "foo"


@pytest.mark.requires("numpy", "sklearn")
def test_sklearn_mmr_by_vector() -> None:
    """Check a max-marginal-relevance search driven by a pre-embedded query."""
    corpus = ["foo", "bar", "baz"]
    # The same embeddings object is used for the store and for the query.
    embedder = FakeEmbeddings()
    store = SKLearnVectorStore.from_texts(corpus, embedder)
    query_vector = embedder.embed_query("foo")
    hits = store.max_marginal_relevance_search_by_vector(
        query_vector,
        k=1,
        fetch_k=3,
    )
    assert len(hits) == 1
    top_hit = hits[0]
    assert top_hit.page_content == "foo"