# langchain/libs/community/tests/integration_tests/vectorstores/test_vdms.py

"""Test VDMS functionality."""
from __future__ import annotations

import logging
import os
from typing import TYPE_CHECKING

import pytest
from langchain_core.documents import Document

from langchain_community.vectorstores import VDMS
from langchain_community.vectorstores.vdms import VDMS_Client, embedding2bytes
from tests.integration_tests.vectorstores.fake_embeddings import (
    ConsistentFakeEmbeddings,
    FakeEmbeddings,
)

if TYPE_CHECKING:
    import vdms

# Configure root logging at DEBUG level for this test module.
logging.basicConfig(level=logging.DEBUG)


# The connection string matches the default settings in the docker-compose file
# located in the root of the repository: [root]/docker/docker-compose.yml
# To spin up a detached VDMS server:
# cd [root]/docker
# docker compose up -d vdms
@pytest.fixture
def vdms_client() -> vdms.vdms:
    """Return a VDMS client for the server the tests run against.

    The host and port are taken from the ``VDMS_DBHOST`` and ``VDMS_DBPORT``
    environment variables, falling back to ``localhost`` and ``6025``.
    """
    db_host = os.getenv("VDMS_DBHOST", "localhost")
    db_port = int(os.getenv("VDMS_DBPORT", 6025))
    return VDMS_Client(host=db_host, port=db_port)


@pytest.mark.requires("vdms")
def test_init_from_client(vdms_client: vdms.vdms) -> None:
    """Check that a VDMS store can be constructed from an existing client."""
    fake_embeddings = FakeEmbeddings()
    VDMS(client=vdms_client, embedding_function=fake_embeddings)


@pytest.mark.requires("vdms")
def test_from_texts_with_metadatas(vdms_client: vdms.vdms) -> None:
    """Build a collection from texts plus metadata and run a similarity search.

    The single hit for the query "foo" must be the "foo" document, carrying
    its ``page`` metadata and the id it was inserted with.
    """
    corpus = ["foo", "bar", "baz"]
    doc_ids = [
        f"test_from_texts_with_metadatas_{idx}" for idx, _ in enumerate(corpus)
    ]
    page_metadata = [{"page": str(page)} for page, _ in enumerate(corpus, start=1)]
    fake_embeddings = FakeEmbeddings()
    store = VDMS.from_texts(
        client=vdms_client,
        collection_name="test_from_texts_with_metadatas",
        embedding=fake_embeddings,
        texts=corpus,
        metadatas=page_metadata,
        ids=doc_ids,
    )

    hits = store.similarity_search("foo", k=1)

    expected = [
        Document(page_content="foo", metadata={"page": "1", "id": doc_ids[0]})
    ]
    assert hits == expected


@pytest.mark.requires("vdms")
def test_from_texts_with_metadatas_with_scores(vdms_client: vdms.vdms) -> None:
    """Build a collection from texts plus metadata and run a scored search.

    The single hit for the query "foo" must be the "foo" document paired with
    a score of exactly 0.0.
    """
    corpus = ["foo", "bar", "baz"]
    doc_ids = [
        f"test_from_texts_with_metadatas_with_scores_{idx}"
        for idx, _ in enumerate(corpus)
    ]
    page_metadata = [{"page": str(page)} for page, _ in enumerate(corpus, start=1)]
    fake_embeddings = FakeEmbeddings()
    store = VDMS.from_texts(
        client=vdms_client,
        collection_name="test_from_texts_with_metadatas_with_scores",
        embedding=fake_embeddings,
        texts=corpus,
        metadatas=page_metadata,
        ids=doc_ids,
    )

    scored_hits = store.similarity_search_with_score("foo", k=1)

    best_doc = Document(page_content="foo", metadata={"page": "1", "id": doc_ids[0]})
    assert scored_hits == [(best_doc, 0.0)]


@pytest.mark.requires("vdms")
def test_from_texts_with_metadatas_with_scores_using_vector(
    vdms_client: vdms.vdms,
) -> None:
    """Test end to end construction and relevance-scored search.

    Builds a collection from three texts with ``page`` metadata and checks that
    ``_similarity_search_with_relevance_scores`` returns the "foo" document
    with a score of 0.0 as the only hit for the query "foo".

    NOTE(review): the test name says "using_vector", but the query is passed
    as text rather than as an embedding vector -- confirm this is intended.
    """
    collection_name = "test_from_texts_with_metadatas_with_scores_using_vector"
    embedding_function = FakeEmbeddings()
    texts = ["foo", "bar", "baz"]
    # Derive the ids from this test's own collection name, as the other tests
    # in this module do, instead of reusing another test's id prefix.
    ids = [f"{collection_name}_{i}" for i in range(len(texts))]
    metadatas = [{"page": str(i)} for i in range(1, len(texts) + 1)]
    docsearch = VDMS.from_texts(
        texts=texts,
        ids=ids,
        embedding=embedding_function,
        metadatas=metadatas,
        collection_name=collection_name,
        client=vdms_client,
    )
    output = docsearch._similarity_search_with_relevance_scores("foo", k=1)
    assert output == [
        (Document(page_content="foo", metadata={"page": "1", "id": ids[0]}), 0.0)
    ]


@pytest.mark.requires("vdms")
def test_search_filter(vdms_client: vdms.vdms) -> None:
    """Test end to end construction and search with metadata filtering.

    Each text is stored with its first letter as ``first_letter`` metadata.
    Filtering on ``"f"`` must return only "far"; filtering on ``"b"`` with
    ``k=2`` must return "bar" followed by "baz".
    """
    collection_name = "test_search_filter"
    embedding_function = FakeEmbeddings()
    texts = ["far", "bar", "baz"]
    ids = [f"test_search_filter_{i}" for i in range(len(texts))]
    # text[0] is already a str, so no string formatting is needed.
    metadatas = [{"first_letter": text[0]} for text in texts]
    docsearch = VDMS.from_texts(
        texts=texts,
        ids=ids,
        embedding=embedding_function,
        metadatas=metadatas,
        collection_name=collection_name,
        client=vdms_client,
    )
    output = docsearch.similarity_search(
        "far", k=1, filter={"first_letter": ["==", "f"]}
    )
    assert output == [
        Document(page_content="far", metadata={"first_letter": "f", "id": ids[0]})
    ]
    output = docsearch.similarity_search(
        "far", k=2, filter={"first_letter": ["==", "b"]}
    )
    assert output == [
        Document(page_content="bar", metadata={"first_letter": "b", "id": ids[1]}),
        Document(page_content="baz", metadata={"first_letter": "b", "id": ids[2]}),
    ]


@pytest.mark.requires("vdms")
def test_search_filter_with_scores(vdms_client: vdms.vdms) -> None:
    """Test end to end construction and scored search with metadata filtering.

    Each text is stored with its first letter as ``first_letter`` metadata.
    Filtering on ``"f"`` must return "far" with score 0.0; filtering on ``"b"``
    with ``k=2`` must return "bar" with score 1.0 followed by "baz" with
    score 4.0.
    """
    collection_name = "test_search_filter_with_scores"
    embedding_function = FakeEmbeddings()
    texts = ["far", "bar", "baz"]
    ids = [f"test_search_filter_with_scores_{i}" for i in range(len(texts))]
    # text[0] is already a str, so no string formatting is needed.
    metadatas = [{"first_letter": text[0]} for text in texts]
    docsearch = VDMS.from_texts(
        texts=texts,
        ids=ids,
        embedding=embedding_function,
        metadatas=metadatas,
        collection_name=collection_name,
        client=vdms_client,
    )
    output = docsearch.similarity_search_with_score(
        "far", k=1, filter={"first_letter": ["==", "f"]}
    )
    assert output == [
        (
            Document(page_content="far", metadata={"first_letter": "f", "id": ids[0]}),
            0.0,
        )
    ]

    output = docsearch.similarity_search_with_score(
        "far", k=2, filter={"first_letter": ["==", "b"]}
    )
    assert output == [
        (
            Document(page_content="bar", metadata={"first_letter": "b", "id": ids[1]}),
            1.0,
        ),
        (
            Document(page_content="baz", metadata={"first_letter": "b", "id": ids[2]}),
            4.0,
        ),
    ]


@pytest.mark.requires("vdms")
def test_mmr(vdms_client: vdms.vdms) -> None:
    """Build a collection and run a max marginal relevance search by text.

    The single hit for the query "foo" must be the "foo" document with only
    its id as metadata.
    """
    corpus = ["foo", "bar", "baz"]
    doc_ids = [f"test_mmr_{idx}" for idx, _ in enumerate(corpus)]
    fake_embeddings = FakeEmbeddings()
    store = VDMS.from_texts(
        client=vdms_client,
        collection_name="test_mmr",
        embedding=fake_embeddings,
        texts=corpus,
        ids=doc_ids,
    )

    hits = store.max_marginal_relevance_search("foo", k=1)

    expected = [Document(page_content="foo", metadata={"id": doc_ids[0]})]
    assert hits == expected


@pytest.mark.requires("vdms")
def test_mmr_by_vector(vdms_client: vdms.vdms) -> None:
    """Build a collection and run a max marginal relevance search by vector.

    The query "foo" is embedded with the same embedding object used to build
    the collection; the single hit must be the "foo" document with only its
    id as metadata.
    """
    corpus = ["foo", "bar", "baz"]
    doc_ids = [f"test_mmr_by_vector_{idx}" for idx, _ in enumerate(corpus)]
    fake_embeddings = FakeEmbeddings()
    store = VDMS.from_texts(
        client=vdms_client,
        collection_name="test_mmr_by_vector",
        embedding=fake_embeddings,
        texts=corpus,
        ids=doc_ids,
    )

    query_vector = fake_embeddings.embed_query("foo")
    hits = store.max_marginal_relevance_search_by_vector(query_vector, k=1)

    expected = [Document(page_content="foo", metadata={"id": doc_ids[0]})]
    assert hits == expected


@pytest.mark.requires("vdms")
def test_with_include_parameter(vdms_client: vdms.vdms) -> None:
    """Check how the ``include`` argument of ``get`` affects its second result.

    With ``include=["embeddings"]`` the second returned element must be
    non-empty; without ``include`` it must be an empty list.
    """
    name = "test_with_include_parameter"
    fake_embeddings = FakeEmbeddings()
    store = VDMS.from_texts(
        client=vdms_client,
        collection_name=name,
        embedding=fake_embeddings,
        texts=["foo", "bar", "baz"],
    )

    _, blobs_with_embeddings = store.get(name, include=["embeddings"])
    assert blobs_with_embeddings != []

    _, blobs_by_default = store.get(name)
    assert blobs_by_default == []


@pytest.mark.requires("vdms")
def test_update_document(vdms_client: vdms.vdms) -> None:
    """Exercise ``VDMS.update_document`` and verify content and embedding change.

    A single document is stored, its embedding is fetched, the document is
    replaced under the same id, and the test then checks that a search returns
    the new content and that the stored embedding matches the new content.
    """
    name = "test_update_document"
    doc_id = "doc1"
    old_text = "foo"
    new_text = "updated foo"

    # ConsistentFakeEmbeddings is used here instead of FakeEmbeddings.
    consistent_embeddings = ConsistentFakeEmbeddings()

    # Seed the collection with the original document.
    seed_doc = Document(page_content=old_text, metadata={"page": "1"})
    store = VDMS.from_documents(
        client=vdms_client,
        collection_name=name,
        documents=[seed_doc],
        embedding=consistent_embeddings,
        ids=[doc_id],
    )

    # Capture the embedding stored before the update.
    _, embedding_before = store.get(
        name,
        constraints={"id": ["==", doc_id]},
        include=["metadata", "embeddings"],
    )

    # Replace the document while keeping the same id and metadata.
    replacement_doc = Document(page_content=new_text, metadata={"page": "1"})
    store.update_document(name, document_id=doc_id, document=replacement_doc)

    # A search for the new content must return the updated document.
    hits = store.similarity_search(new_text, k=1)
    expected = [Document(page_content=new_text, metadata={"page": "1", "id": doc_id})]
    assert hits == expected

    # The stored embedding must now correspond to the new content.
    _, embedding_after = store.get(
        name,
        constraints={"id": ["==", doc_id]},
        include=["metadata", "embeddings"],
    )
    expected_bytes = embedding2bytes(
        consistent_embeddings.embed_documents([new_text])[0]
    )
    assert embedding_after[0] == expected_bytes
    assert embedding_after != embedding_before


@pytest.mark.requires("vdms")
def test_with_relevance_score(vdms_client: vdms.vdms) -> None:
    """Check the relevance scores returned for all three stored documents.

    Searching for "foo" with ``k=3`` must return "foo", "bar" and "baz" in
    that order with relevance scores 0.0, 0.25 and 1.0.
    """
    corpus = ["foo", "bar", "baz"]
    doc_ids = [f"test_relevance_scores_{idx}" for idx, _ in enumerate(corpus)]
    page_metadata = [{"page": str(page)} for page, _ in enumerate(corpus, start=1)]
    fake_embeddings = FakeEmbeddings()
    store = VDMS.from_texts(
        client=vdms_client,
        collection_name="test_with_relevance_score",
        embedding=fake_embeddings,
        texts=corpus,
        metadatas=page_metadata,
        ids=doc_ids,
    )

    scored_hits = store.similarity_search_with_relevance_scores("foo", k=3)

    foo_doc = Document(page_content="foo", metadata={"page": "1", "id": doc_ids[0]})
    bar_doc = Document(page_content="bar", metadata={"page": "2", "id": doc_ids[1]})
    baz_doc = Document(page_content="baz", metadata={"page": "3", "id": doc_ids[2]})
    assert scored_hits == [(foo_doc, 0.0), (bar_doc, 0.25), (baz_doc, 1.0)]


@pytest.mark.requires("vdms")
def test_add_documents_no_metadata(vdms_client: vdms.vdms) -> None:
    """Check that a document without metadata can be added to a store."""
    fake_embeddings = FakeEmbeddings()
    store = VDMS(
        client=vdms_client,
        embedding_function=fake_embeddings,
        collection_name="test_add_documents_no_metadata",
    )
    bare_doc = Document(page_content="foo")
    store.add_documents([bare_doc])


@pytest.mark.requires("vdms")
def test_add_documents_mixed_metadata(vdms_client: vdms.vdms) -> None:
    """Add one document without metadata and one with, then search for both.

    ``add_documents`` must return the ids it was given, and a ``k=2`` search
    must return both documents with their id added to the metadata. Results
    are compared after sorting by page content.
    """

    def by_content(doc: Document) -> str:
        return doc.page_content

    fake_embeddings = FakeEmbeddings()
    store = VDMS(
        client=vdms_client,
        embedding_function=fake_embeddings,
        collection_name="test_add_documents_mixed_metadata",
    )

    plain_doc = Document(page_content="foo")
    tagged_doc = Document(page_content="bar", metadata={"baz": 1})
    docs = [plain_doc, tagged_doc]
    ids = ["10", "11"]

    returned_ids = store.add_documents(docs, ids=ids)
    assert returned_ids == ids

    hits = store.similarity_search("foo bar", k=2)

    # Turn the local documents into the expected search results.
    docs[0].metadata = {"id": ids[0]}
    docs[1].metadata["id"] = ids[1]
    assert sorted(hits, key=by_content) == sorted(docs, key=by_content)