langchain/tests/integration_tests/vectorstores/test_chroma.py
Raymond Yuan 5171c3bcca
Refactor vector storage to correctly handle relevancy scores (#6570)
Description: This pull request aims to support generating the correct
generic relevancy scores for different vector stores by refactoring the
relevance score functions and their selection in the base class and
subclasses of VectorStore. This is especially relevant with VectorStores
that require a distance metric upon initialization. Note many of the
current implementations of `_similarity_search_with_relevance_scores` are
not technically correct, as they just return
`self.similarity_search_with_score(query, k, **kwargs)` without applying
the relevant score function

Also includes changes associated with:
https://github.com/hwchase17/langchain/pull/6564 and
https://github.com/hwchase17/langchain/pull/6494

See the more in-depth discussion in the thread in #6494

Issue: 
https://github.com/hwchase17/langchain/issues/6526
https://github.com/hwchase17/langchain/issues/6481
https://github.com/hwchase17/langchain/issues/6346

Dependencies: None

The changes include:
- Properly handling score thresholding in FAISS
`similarity_search_with_score_by_vector` for the corresponding distance
metric.
- Refactoring the `_similarity_search_with_relevance_scores` method in
the base class and removing it from the subclasses for incorrectly
implemented subclasses.
- Adding a `_select_relevance_score_fn` method in the base class and
implementing it in the subclasses to select the appropriate relevance
score function based on the distance strategy.
- Updating the `__init__` methods of the subclasses to set the
`relevance_score_fn` attribute.
- Removing the `_default_relevance_score_fn` function from the FAISS
class and using the base class's `_euclidean_relevance_score_fn`
instead.
- Adding the `DistanceStrategy` enum to the `utils.py` file and updating
the imports in the vector store classes.
- Updating the tests to import the `DistanceStrategy` enum from the
`utils.py` file.

---------

Co-authored-by: Hanit <37485638+hanit-com@users.noreply.github.com>
2023-07-10 20:37:03 -07:00

270 lines
9.6 KiB
Python

"""Test Chroma functionality."""
import pytest
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from tests.integration_tests.vectorstores.fake_embeddings import (
ConsistentFakeEmbeddings,
FakeEmbeddings,
)
def test_chroma() -> None:
    """Build a Chroma store from texts and run a basic similarity search.

    Asserts that searching for "foo" with ``k=1`` returns only the "foo"
    document.
    """
    corpus = ["foo", "bar", "baz"]
    store = Chroma.from_texts(
        texts=corpus,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected
@pytest.mark.asyncio
async def test_chroma_async() -> None:
    """Build a Chroma store from texts and run the async similarity search.

    Asserts that awaiting ``asimilarity_search`` for "foo" with ``k=1``
    returns only the "foo" document.
    """
    corpus = ["foo", "bar", "baz"]
    store = Chroma.from_texts(
        texts=corpus,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    hits = await store.asimilarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected
def test_chroma_with_metadatas() -> None:
    """Build a Chroma store with per-text metadata and search it.

    Each text is tagged with its position as a string under the "page" key;
    the search for "foo" must return the "foo" document with page "0".
    """
    corpus = ["foo", "bar", "baz"]
    page_tags = [{"page": str(idx)} for idx, _ in enumerate(corpus)]
    store = Chroma.from_texts(
        texts=corpus,
        metadatas=page_tags,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo", metadata={"page": "0"})]
    assert hits == expected
def test_chroma_with_metadatas_with_scores() -> None:
    """Build a Chroma store with metadata and run a scored search.

    The search for "foo" with ``k=1`` must return the "foo" document
    (page "0") paired with a score of 0.0.
    """
    corpus = ["foo", "bar", "baz"]
    page_tags = [{"page": str(idx)} for idx, _ in enumerate(corpus)]
    store = Chroma.from_texts(
        texts=corpus,
        metadatas=page_tags,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    scored_hits = store.similarity_search_with_score("foo", k=1)
    foo_doc = Document(page_content="foo", metadata={"page": "0"})
    assert scored_hits == [(foo_doc, 0.0)]
def test_chroma_with_metadatas_with_scores_using_vector() -> None:
    """Run a scored search driven by a pre-computed query embedding.

    The query "foo" is embedded with the same embedding object used to build
    the store, and the resulting vector is passed to
    ``similarity_search_by_vector_with_relevance_scores``. The single hit
    must be the "foo" document (page "0") paired with 0.0.
    """
    corpus = ["foo", "bar", "baz"]
    page_tags = [{"page": str(idx)} for idx, _ in enumerate(corpus)]
    embedder = FakeEmbeddings()
    store = Chroma.from_texts(
        texts=corpus,
        metadatas=page_tags,
        embedding=embedder,
        collection_name="test_collection",
    )
    query_vector = embedder.embed_query("foo")
    scored_hits = store.similarity_search_by_vector_with_relevance_scores(
        embedding=query_vector, k=1
    )
    foo_doc = Document(page_content="foo", metadata={"page": "0"})
    assert scored_hits == [(foo_doc, 0.0)]
def test_chroma_search_filter() -> None:
    """Build a Chroma store and search it with a metadata filter.

    Every text is tagged with its first letter. The same query ("far") is
    issued twice with different ``first_letter`` filters, and each time the
    single hit must be the expected document for that letter.
    """
    corpus = ["far", "bar", "baz"]
    letter_tags = [{"first_letter": "{}".format(word[0])} for word in corpus]
    store = Chroma.from_texts(
        texts=corpus,
        metadatas=letter_tags,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    # (filter letter, text expected back) pairs, checked in this order.
    cases = [("f", "far"), ("b", "bar")]
    for letter, expected_text in cases:
        hits = store.similarity_search(
            "far", k=1, filter={"first_letter": letter}
        )
        expected_doc = Document(
            page_content=expected_text, metadata={"first_letter": letter}
        )
        assert hits == [expected_doc]
def test_chroma_search_filter_with_scores() -> None:
    """Build a Chroma store and run scored searches with a metadata filter.

    Every text is tagged with its first letter. The query "far" is issued
    once per filter letter; the "f" filter must yield "far" with score 0.0
    and the "b" filter must yield "bar" with score 1.0.
    """
    corpus = ["far", "bar", "baz"]
    letter_tags = [{"first_letter": "{}".format(word[0])} for word in corpus]
    store = Chroma.from_texts(
        texts=corpus,
        metadatas=letter_tags,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    # (filter letter, text expected back, expected score), checked in order.
    cases = [("f", "far", 0.0), ("b", "bar", 1.0)]
    for letter, expected_text, expected_score in cases:
        scored_hits = store.similarity_search_with_score(
            "far", k=1, filter={"first_letter": letter}
        )
        expected_doc = Document(
            page_content=expected_text, metadata={"first_letter": letter}
        )
        assert scored_hits == [(expected_doc, expected_score)]
def test_chroma_with_persistence() -> None:
    """Test end to end construction and search, with persistence.

    The store is built with a persist directory, searched, persisted, and
    then re-opened from the same directory with the same collection name.
    The search is repeated on the re-opened store and must return the same
    document. The collection is deleted afterwards whether or not the
    assertions pass.
    """
    chroma_persist_dir = "./tests/persist_dir"
    collection_name = "test_collection"
    texts = ["foo", "bar", "baz"]
    docsearch = Chroma.from_texts(
        collection_name=collection_name,
        texts=texts,
        embedding=FakeEmbeddings(),
        persist_directory=chroma_persist_dir,
    )
    try:
        output = docsearch.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo")]

        docsearch.persist()

        # Get a new VectorStore from the persisted directory
        docsearch = Chroma(
            collection_name=collection_name,
            embedding_function=FakeEmbeddings(),
            persist_directory=chroma_persist_dir,
        )
        output = docsearch.similarity_search("foo", k=1)
        # Check the reloaded store too; without this assertion the
        # persistence round-trip would go unverified.
        assert output == [Document(page_content="foo")]
    finally:
        # Clean up even when an assertion fails, so a failed run does not
        # leave a populated collection behind in the persist directory.
        docsearch.delete_collection()
        # Persist doesn't need to be called again
        # Data will be automatically persisted on object deletion
        # Or on program exit
def test_chroma_mmr() -> None:
    """Build a Chroma store and run a max-marginal-relevance search.

    Asserts that ``max_marginal_relevance_search`` for "foo" with ``k=1``
    returns only the "foo" document.
    """
    corpus = ["foo", "bar", "baz"]
    store = Chroma.from_texts(
        texts=corpus,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    hits = store.max_marginal_relevance_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected
def test_chroma_mmr_by_vector() -> None:
    """Run a max-marginal-relevance search from a pre-computed embedding.

    The query "foo" is embedded with the same embedding object used to build
    the store, and the vector is passed to
    ``max_marginal_relevance_search_by_vector``. The single hit must be the
    "foo" document.
    """
    corpus = ["foo", "bar", "baz"]
    embedder = FakeEmbeddings()
    store = Chroma.from_texts(
        texts=corpus,
        embedding=embedder,
        collection_name="test_collection",
    )
    query_vector = embedder.embed_query("foo")
    hits = store.max_marginal_relevance_search_by_vector(query_vector, k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected
def test_chroma_with_include_parameter() -> None:
    """Check how the ``include`` argument of ``get`` affects embeddings.

    When "embeddings" is requested explicitly the returned payload must
    carry them; when ``get`` is called with no arguments the "embeddings"
    entry must be ``None``.
    """
    corpus = ["foo", "bar", "baz"]
    store = Chroma.from_texts(
        texts=corpus,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )

    with_embeddings = store.get(include=["embeddings"])
    assert with_embeddings["embeddings"] is not None

    default_payload = store.get()
    assert default_payload["embeddings"] is None
def test_chroma_update_document() -> None:
    """Exercise ``Chroma.update_document`` on a single stored document.

    A one-document store is created, the document is replaced under the same
    id, and the test then checks that the search returns the new content and
    that the stored embedding matches the embedding of the new content and
    differs from the one stored before the update.
    """
    doc_id = "doc1"
    first_text = "foo"
    second_text = "updated foo"
    embedder = ConsistentFakeEmbeddings()

    # Seed the store with the original version of the document.
    first_doc = Document(page_content=first_text, metadata={"page": "0"})
    store = Chroma.from_documents(
        documents=[first_doc],
        embedding=embedder,
        ids=[doc_id],
        collection_name="test_collection",
    )

    def stored_embedding():
        """Look up the embedding currently stored under ``doc_id``."""
        collection = store._collection
        position = collection.peek()["ids"].index(doc_id)
        return collection.peek()["embeddings"][position]

    embedding_before = stored_embedding()

    # Replace the document's content while keeping id and metadata the same.
    second_doc = Document(page_content=second_text, metadata={"page": "0"})
    store.update_document(document_id=doc_id, document=second_doc)

    # Searching for the new content must surface the updated document.
    hits = store.similarity_search(second_text, k=1)
    assert hits == [Document(page_content=second_text, metadata={"page": "0"})]

    # The stored vector must be the embedding of the new text, not the old one.
    embedding_after = stored_embedding()
    assert embedding_after == embedder.embed_documents([second_text])[0]
    assert embedding_after != embedding_before
def test_chroma_with_relevance_score() -> None:
    """Check the relevance scores returned for an "l2"-space collection.

    The collection is created with ``{"hnsw:space": "l2"}`` and queried for
    "foo" with ``k=3``. The three documents must come back in insertion
    order with scores 1.0, 0.8 and 0.5 respectively.
    """
    corpus = ["foo", "bar", "baz"]
    page_tags = [{"page": str(idx)} for idx, _ in enumerate(corpus)]
    store = Chroma.from_texts(
        texts=corpus,
        metadatas=page_tags,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
        collection_metadata={"hnsw:space": "l2"},
    )
    scored_hits = store.similarity_search_with_relevance_scores("foo", k=3)

    expected_scores = [1.0, 0.8, 0.5]
    expected = [
        (Document(page_content=text, metadata=tag), score)
        for text, tag, score in zip(corpus, page_tags, expected_scores)
    ]
    assert scored_hits == expected
def test_chroma_with_relevance_score_custom_normalization_fn() -> None:
    """Check that a caller-supplied ``relevance_score_fn`` is applied.

    The store is created with a scoring function that multiplies its input
    by zero, so every one of the three hits for "foo" is expected to be
    paired with -0.0.
    """
    corpus = ["foo", "bar", "baz"]
    page_tags = [{"page": str(idx)} for idx, _ in enumerate(corpus)]
    store = Chroma.from_texts(
        texts=corpus,
        metadatas=page_tags,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
        collection_metadata={"hnsw:space": "l2"},
        relevance_score_fn=lambda d: d * 0,
    )
    scored_hits = store.similarity_search_with_relevance_scores("foo", k=3)

    expected = [
        (Document(page_content=text, metadata=tag), -0.0)
        for text, tag in zip(corpus, page_tags)
    ]
    assert scored_hits == expected