5171c3bcca
Description: This pull request aims to support generating the correct generic relevancy scores for different vector stores by refactoring the relevance score functions and their selection in the base class and subclasses of `VectorStore`. This is especially relevant for vector stores that require a distance metric upon initialization. Note that many of the current implementations of `_similarity_search_with_relevance_scores` are not technically correct, as they just return `self.similarity_search_with_score(query, k, **kwargs)` without applying the relevant score function.

Also includes changes associated with:
- https://github.com/hwchase17/langchain/pull/6564
- https://github.com/hwchase17/langchain/pull/6494

See the more in-depth discussion in the thread in #6494.

Issue:
- https://github.com/hwchase17/langchain/issues/6526
- https://github.com/hwchase17/langchain/issues/6481
- https://github.com/hwchase17/langchain/issues/6346

Dependencies: None

The changes include:
- Properly handling score thresholding in FAISS `similarity_search_with_score_by_vector` for the corresponding distance metric.
- Refactoring the `_similarity_search_with_relevance_scores` method in the base class and removing it from the subclasses that implemented it incorrectly.
- Adding a `_select_relevance_score_fn` method in the base class and implementing it in the subclasses to select the appropriate relevance score function based on the distance strategy.
- Updating the `__init__` methods of the subclasses to set the `relevance_score_fn` attribute.
- Removing the `_default_relevance_score_fn` function from the FAISS class and using the base class's `_euclidean_relevance_score_fn` instead.
- Adding the `DistanceStrategy` enum to the `utils.py` file and updating the imports in the vector store classes.
- Updating the tests to import the `DistanceStrategy` enum from the `utils.py` file.

---------

Co-authored-by: Hanit <37485638+hanit-com@users.noreply.github.com>
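A minimal sketch of the selection pattern this PR describes (the method and enum names are taken from the description above; the bodies are illustrative, not the code actually merged):

```python
from enum import Enum
from typing import Any, Callable, List, Tuple

from langchain.docstore.document import Document


class DistanceStrategy(str, Enum):
    """Distance metrics a vector store may be initialized with."""

    EUCLIDEAN_DISTANCE = "l2"
    MAX_INNER_PRODUCT = "ip"
    COSINE = "cosine"


class VectorStoreSketch:
    distance_strategy = DistanceStrategy.EUCLIDEAN_DISTANCE

    @staticmethod
    def _euclidean_relevance_score_fn(distance: float) -> float:
        # Assumed normalization: for embeddings whose pairwise distances fall
        # in [0, sqrt(2)], map a euclidean distance onto a relevance in [0, 1].
        return 1.0 - distance / 2**0.5

    def _select_relevance_score_fn(self) -> Callable[[float], float]:
        # Each subclass picks the function matching its distance strategy.
        if self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
            return self._euclidean_relevance_score_fn
        raise ValueError("No relevance score function for this strategy")

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        raise NotImplementedError  # provided by each concrete store

    def _similarity_search_with_relevance_scores(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        # Single base-class implementation: fetch raw (doc, distance) pairs
        # and normalize them, instead of each subclass returning raw scores.
        relevance_score_fn = self._select_relevance_score_fn()
        docs_and_scores = self.similarity_search_with_score(query, k, **kwargs)
        return [(doc, relevance_score_fn(score)) for doc, score in docs_and_scores]
```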
270 lines
9.6 KiB
Python
"""Test Chroma functionality."""
|
|
import pytest
|
|
|
|
from langchain.docstore.document import Document
|
|
from langchain.vectorstores import Chroma
|
|
from tests.integration_tests.vectorstores.fake_embeddings import (
|
|
ConsistentFakeEmbeddings,
|
|
FakeEmbeddings,
|
|
)
|
|
|
|
|
|
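# NOTE: the fake embedding helpers imported above are deterministic test
# doubles. Roughly: FakeEmbeddings embeds the i-th document as a fixed vector
# differing only in its last component and embeds every query like the first
# document, which is why "foo" below matches itself at distance 0.0.
# ConsistentFakeEmbeddings additionally remembers each text it has embedded,
# so identical text always maps to an identical vector across calls (relied on
# by test_chroma_update_document). This is a paraphrase, not the helpers'
# exact source.
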
def test_chroma() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    docsearch = Chroma.from_texts(
        collection_name="test_collection", texts=texts, embedding=FakeEmbeddings()
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]

@pytest.mark.asyncio
async def test_chroma_async() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    docsearch = Chroma.from_texts(
        collection_name="test_collection", texts=texts, embedding=FakeEmbeddings()
    )
    output = await docsearch.asimilarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]

def test_chroma_with_metadatas() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Chroma.from_texts(
        collection_name="test_collection",
        texts=texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo", metadata={"page": "0"})]

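# NOTE: similarity_search_with_score (used in the next two tests) is expected
# to surface the store's raw distance, where lower is better and 0.0 is an
# exact match, while similarity_search_with_relevance_scores (reworked by this
# PR; see the last two tests) normalizes to a 0-1 score where higher is better.
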
def test_chroma_with_metadatas_with_scores() -> None:
    """Test end to end construction and scored search."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Chroma.from_texts(
        collection_name="test_collection",
        texts=texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
    )
    output = docsearch.similarity_search_with_score("foo", k=1)
    assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]

def test_chroma_with_metadatas_with_scores_using_vector() -> None:
    """Test end to end construction and scored search, using embedding vector."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    embeddings = FakeEmbeddings()

    docsearch = Chroma.from_texts(
        collection_name="test_collection",
        texts=texts,
        embedding=embeddings,
        metadatas=metadatas,
    )
    embedded_query = embeddings.embed_query("foo")
    output = docsearch.similarity_search_by_vector_with_relevance_scores(
        embedding=embedded_query, k=1
    )
    assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]

def test_chroma_search_filter() -> None:
    """Test end to end construction and search with metadata filtering."""
    texts = ["far", "bar", "baz"]
    metadatas = [{"first_letter": "{}".format(text[0])} for text in texts]
    docsearch = Chroma.from_texts(
        collection_name="test_collection",
        texts=texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
    )
    output = docsearch.similarity_search("far", k=1, filter={"first_letter": "f"})
    assert output == [Document(page_content="far", metadata={"first_letter": "f"})]
    output = docsearch.similarity_search("far", k=1, filter={"first_letter": "b"})
    assert output == [Document(page_content="bar", metadata={"first_letter": "b"})]

def test_chroma_search_filter_with_scores() -> None:
    """Test end to end construction and scored search with metadata filtering."""
    texts = ["far", "bar", "baz"]
    metadatas = [{"first_letter": "{}".format(text[0])} for text in texts]
    docsearch = Chroma.from_texts(
        collection_name="test_collection",
        texts=texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
    )
    output = docsearch.similarity_search_with_score(
        "far", k=1, filter={"first_letter": "f"}
    )
    assert output == [
        (Document(page_content="far", metadata={"first_letter": "f"}), 0.0)
    ]
    output = docsearch.similarity_search_with_score(
        "far", k=1, filter={"first_letter": "b"}
    )
    assert output == [
        (Document(page_content="bar", metadata={"first_letter": "b"}), 1.0)
    ]

def test_chroma_with_persistence() -> None:
    """Test end to end construction and search, with persistence."""
    chroma_persist_dir = "./tests/persist_dir"
    collection_name = "test_collection"
    texts = ["foo", "bar", "baz"]
    docsearch = Chroma.from_texts(
        collection_name=collection_name,
        texts=texts,
        embedding=FakeEmbeddings(),
        persist_directory=chroma_persist_dir,
    )

    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]

    docsearch.persist()

    # Get a new VectorStore from the persisted directory
    docsearch = Chroma(
        collection_name=collection_name,
        embedding_function=FakeEmbeddings(),
        persist_directory=chroma_persist_dir,
    )
    output = docsearch.similarity_search("foo", k=1)

    # Clean up
    docsearch.delete_collection()

    # Persist doesn't need to be called again
    # Data will be automatically persisted on object deletion
    # Or on program exit

def test_chroma_mmr() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    docsearch = Chroma.from_texts(
        collection_name="test_collection", texts=texts, embedding=FakeEmbeddings()
    )
    output = docsearch.max_marginal_relevance_search("foo", k=1)
    assert output == [Document(page_content="foo")]

def test_chroma_mmr_by_vector() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    embeddings = FakeEmbeddings()
    docsearch = Chroma.from_texts(
        collection_name="test_collection", texts=texts, embedding=embeddings
    )
    embedded_query = embeddings.embed_query("foo")
    output = docsearch.max_marginal_relevance_search_by_vector(embedded_query, k=1)
    assert output == [Document(page_content="foo")]

def test_chroma_with_include_parameter() -> None:
    """Test end to end construction and include parameter."""
    texts = ["foo", "bar", "baz"]
    docsearch = Chroma.from_texts(
        collection_name="test_collection", texts=texts, embedding=FakeEmbeddings()
    )
    output = docsearch.get(include=["embeddings"])
    assert output["embeddings"] is not None
    output = docsearch.get()
    assert output["embeddings"] is None

def test_chroma_update_document() -> None:
    """Test the update_document function in the Chroma class."""
    # Make a consistent embedding
    embedding = ConsistentFakeEmbeddings()

    # Initial document content and id
    initial_content = "foo"
    document_id = "doc1"

    # Create an instance of Document with initial content and metadata
    original_doc = Document(page_content=initial_content, metadata={"page": "0"})

    # Initialize a Chroma instance with the original document
    docsearch = Chroma.from_documents(
        collection_name="test_collection",
        documents=[original_doc],
        embedding=embedding,
        ids=[document_id],
    )
    old_embedding = docsearch._collection.peek()["embeddings"][
        docsearch._collection.peek()["ids"].index(document_id)
    ]

    # Define updated content for the document
    updated_content = "updated foo"

    # Create a new Document instance with the updated content and the same id
    updated_doc = Document(page_content=updated_content, metadata={"page": "0"})

    # Update the document in the Chroma instance
    docsearch.update_document(document_id=document_id, document=updated_doc)

    # Perform a similarity search with the updated content
    output = docsearch.similarity_search(updated_content, k=1)

    # Assert that the updated document is returned by the search
    assert output == [Document(page_content=updated_content, metadata={"page": "0"})]

    # Assert that the new embedding is correct
    new_embedding = docsearch._collection.peek()["embeddings"][
        docsearch._collection.peek()["ids"].index(document_id)
    ]
    assert new_embedding == embedding.embed_documents([updated_content])[0]
    assert new_embedding != old_embedding

def test_chroma_with_relevance_score() -> None:
    """Test to make sure the relevance score is scaled to 0-1."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Chroma.from_texts(
        collection_name="test_collection",
        texts=texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
        collection_metadata={"hnsw:space": "l2"},
    )
    output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
    assert output == [
        (Document(page_content="foo", metadata={"page": "0"}), 1.0),
        (Document(page_content="bar", metadata={"page": "1"}), 0.8),
        (Document(page_content="baz", metadata={"page": "2"}), 0.5),
    ]

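# NOTE: the relevance_score_fn passed below is assumed to take precedence over
# whatever default _select_relevance_score_fn would pick for the "l2" space,
# so every score collapses to d * 0 (asserted as -0.0, which compares equal
# to 0.0).
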
def test_chroma_with_relevance_score_custom_normalization_fn() -> None:
    """Test searching with relevance score and custom normalization function."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = Chroma.from_texts(
        collection_name="test_collection",
        texts=texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
        relevance_score_fn=lambda d: d * 0,
        collection_metadata={"hnsw:space": "l2"},
    )
    output = docsearch.similarity_search_with_relevance_scores("foo", k=3)
    assert output == [
        (Document(page_content="foo", metadata={"page": "0"}), -0.0),
        (Document(page_content="bar", metadata={"page": "1"}), -0.0),
        (Document(page_content="baz", metadata={"page": "2"}), -0.0),
    ]
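Since this PR also reworks score thresholding, here is a hedged sketch of how the normalized scores are typically consumed downstream; the `score_threshold` kwarg is assumed from the base `VectorStore` search API of this era, and the threshold value is illustrative:

```python
from langchain.vectorstores import Chroma
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

docsearch = Chroma.from_texts(
    collection_name="test_collection",
    texts=["foo", "bar", "baz"],
    embedding=FakeEmbeddings(),
    collection_metadata={"hnsw:space": "l2"},
)

# Because relevance scores are normalized to [0, 1], a threshold has a
# store-independent meaning; with raw distances it would depend on the metric.
results = docsearch.similarity_search_with_relevance_scores(
    "foo", k=3, score_threshold=0.5
)
```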