langchain/tests/integration_tests/vectorstores/test_chroma.py

"""Test Chroma functionality."""
import pytest

from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings


def test_chroma() -> None:
    """Build a store from three texts and check a k=1 similarity search for "foo"."""
    store = Chroma.from_texts(
        texts=["foo", "bar", "baz"],
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected


@pytest.mark.asyncio
async def test_chroma_async() -> None:
    """Check that the awaitable `asimilarity_search` returns the "foo" document."""
    corpus = ["foo", "bar", "baz"]
    store = Chroma.from_texts(
        texts=corpus,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    hits = await store.asimilarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected


def test_chroma_with_metadatas() -> None:
    """Check that per-text metadata comes back attached to the searched document."""
    corpus = ["foo", "bar", "baz"]
    # One metadata dict per text, keyed by the text's position as a string.
    page_tags = [{"page": str(position)} for position, _ in enumerate(corpus)]
    store = Chroma.from_texts(
        texts=corpus,
        metadatas=page_tags,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo", metadata={"page": "0"})]
    assert hits == expected


def test_chroma_with_metadatas_with_scores() -> None:
    """Check that a scored search yields a (document, score) pair with metadata."""
    corpus = ["foo", "bar", "baz"]
    # One metadata dict per text, keyed by the text's position as a string.
    page_tags = [{"page": str(position)} for position, _ in enumerate(corpus)]
    store = Chroma.from_texts(
        texts=corpus,
        metadatas=page_tags,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    scored_hits = store.similarity_search_with_score("foo", k=1)
    expected_doc = Document(page_content="foo", metadata={"page": "0"})
    assert scored_hits == [(expected_doc, 0.0)]


def test_chroma_search_filter() -> None:
    """Check that a metadata `filter` restricts which document a search returns."""
    corpus = ["far", "bar", "baz"]
    # Tag every text with its own first character so the filter can select on it.
    letter_tags = [{"first_letter": word[0]} for word in corpus]
    store = Chroma.from_texts(
        texts=corpus,
        metadatas=letter_tags,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )

    # The query stays "far"; only the filter changes between the two checks.
    for letter, expected_text in (("f", "far"), ("b", "bar")):
        hits = store.similarity_search("far", k=1, filter={"first_letter": letter})
        expected_doc = Document(
            page_content=expected_text, metadata={"first_letter": letter}
        )
        assert hits == [expected_doc]


def test_chroma_search_filter_with_scores() -> None:
    """Check that a filtered, scored search returns the expected document and score."""
    corpus = ["far", "bar", "baz"]
    # Tag every text with its own first character so the filter can select on it.
    letter_tags = [{"first_letter": word[0]} for word in corpus]
    store = Chroma.from_texts(
        texts=corpus,
        metadatas=letter_tags,
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )

    # (filter letter, expected text, expected score) for the fixed query "far".
    cases = (("f", "far", 0.0), ("b", "bar", 1.0))
    for letter, expected_text, expected_score in cases:
        scored_hits = store.similarity_search_with_score(
            "far", k=1, filter={"first_letter": letter}
        )
        expected_doc = Document(
            page_content=expected_text, metadata={"first_letter": letter}
        )
        assert scored_hits == [(expected_doc, expected_score)]


def test_chroma_with_persistence() -> None:
    """Test end to end construction and search, with persistence.

    Builds a store backed by ``persist_directory``, persists it, re-opens the
    same collection through a fresh ``Chroma`` instance and checks that the
    re-opened store answers the same query with the same document.
    """
    chroma_persist_dir = "./tests/persist_dir"
    collection_name = "test_collection"
    texts = ["foo", "bar", "baz"]
    docsearch = Chroma.from_texts(
        collection_name=collection_name,
        texts=texts,
        embedding=FakeEmbeddings(),
        persist_directory=chroma_persist_dir,
    )

    try:
        output = docsearch.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo")]

        docsearch.persist()

        # Get a new VectorStore from the persisted directory
        docsearch = Chroma(
            collection_name=collection_name,
            embedding_function=FakeEmbeddings(),
            persist_directory=chroma_persist_dir,
        )
        output = docsearch.similarity_search("foo", k=1)
        # Without this check the test would pass even if nothing was reloaded.
        assert output == [Document(page_content="foo")]
    finally:
        # Clean up even when an assertion above fails, so a failed run does not
        # leave the collection behind for later runs.
        docsearch.delete_collection()

    # Persist doesn't need to be called again
    # Data will be automatically persisted on object deletion
    # Or on program exit


def test_chroma_mmr() -> None:
    """Check a k=1 max-marginal-relevance search for "foo" on a three-text store."""
    store = Chroma.from_texts(
        texts=["foo", "bar", "baz"],
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )
    hits = store.max_marginal_relevance_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected


def test_chroma_mmr_by_vector() -> None:
    """Check a k=1 max-marginal-relevance search driven by a pre-embedded query."""
    embedder = FakeEmbeddings()
    store = Chroma.from_texts(
        texts=["foo", "bar", "baz"],
        embedding=embedder,
        collection_name="test_collection",
    )
    # Embed the query with the same embedder instance the store was built with.
    query_vector = embedder.embed_query("foo")
    hits = store.max_marginal_relevance_search_by_vector(query_vector, k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected


def test_chroma_with_include_parameter() -> None:
    """Check that `get` returns embeddings only when asked for via `include`."""
    store = Chroma.from_texts(
        texts=["foo", "bar", "baz"],
        embedding=FakeEmbeddings(),
        collection_name="test_collection",
    )

    # Explicitly requested: the "embeddings" entry must be populated.
    requested = store.get(include=["embeddings"])
    assert requested["embeddings"] is not None

    # No `include` argument: the "embeddings" entry is expected to be None.
    default_result = store.get()
    assert default_result["embeddings"] is None


def test_chroma_update_document() -> None:
    """Replace a stored document by id and check a k=1 search for the new text."""
    doc_id = "doc1"

    # Seed the collection with a single document registered under `doc_id`.
    store = Chroma.from_documents(
        documents=[Document(page_content="foo", metadata={"page": "0"})],
        embedding=FakeEmbeddings(),
        ids=[doc_id],
        collection_name="test_collection",
    )

    # Swap in new content under the same id, keeping the metadata unchanged.
    new_text = "updated foo"
    replacement = Document(page_content=new_text, metadata={"page": "0"})
    store.update_document(document_id=doc_id, document=replacement)

    # Searching for the new text should return the replacement document.
    hits = store.similarity_search(new_text, k=1)
    assert hits == [Document(page_content=new_text, metadata={"page": "0"})]
Chroma in LangChain (#1010) Chroma is a simple-to-use, open-source, zero-config, zero-setup vectorstore. Simply `pip install chromadb`, and you're good to go. Out-of-the-box Chroma is suitable for most LangChain workloads, but is highly flexible. I tested to 1M embeddings on my M1 Mac, without issues and with reasonably fast query times. Look out for future releases as we integrate more Chroma features with LangChain! 2023-02-13 01:43:48 +00:00			`"""Test Chroma functionality."""`
Add workaround for not having async vector store methods (#2733) This allows us to use the async API for the Retrieval chains, though it is not guaranteed to be thread safe. 2023-04-12 01:49:08 +00:00			`import pytest`

Chroma in LangChain (#1010) Chroma is a simple-to-use, open-source, zero-config, zero-setup vectorstore. Simply `pip install chromadb`, and you're good to go. Out-of-the-box Chroma is suitable for most LangChain workloads, but is highly flexible. I tested to 1M embeddings on my M1 Mac, without issues and with reasonably fast query times. Look out for future releases as we integrate more Chroma features with LangChain! 2023-02-13 01:43:48 +00:00			`from langchain.docstore.document import Document`
			`from langchain.vectorstores import Chroma`
			`from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings`


			`def test_chroma() -> None:`
			`"""Test end to end construction and search."""`
			`texts = ["foo", "bar", "baz"]`
			`docsearch = Chroma.from_texts(`
			`collection_name="test_collection", texts=texts, embedding=FakeEmbeddings()`
			`)`
			`output = docsearch.similarity_search("foo", k=1)`
			`assert output == [Document(page_content="foo")]`


Add workaround for not having async vector store methods (#2733) This allows us to use the async API for the Retrieval chains, though it is not guaranteed to be thread safe. 2023-04-12 01:49:08 +00:00			`@pytest.mark.asyncio`
			`async def test_chroma_async() -> None:`
			`"""Test end to end construction and search."""`
			`texts = ["foo", "bar", "baz"]`
			`docsearch = Chroma.from_texts(`
			`collection_name="test_collection", texts=texts, embedding=FakeEmbeddings()`
			`)`
			`output = await docsearch.asimilarity_search("foo", k=1)`
			`assert output == [Document(page_content="foo")]`


Chroma in LangChain (#1010) Chroma is a simple-to-use, open-source, zero-config, zero-setup vectorstore. Simply `pip install chromadb`, and you're good to go. Out-of-the-box Chroma is suitable for most LangChain workloads, but is highly flexible. I tested to 1M embeddings on my M1 Mac, without issues and with reasonably fast query times. Look out for future releases as we integrate more Chroma features with LangChain! 2023-02-13 01:43:48 +00:00			`def test_chroma_with_metadatas() -> None:`
			`"""Test end to end construction and search."""`
			`texts = ["foo", "bar", "baz"]`
			`metadatas = [{"page": str(i)} for i in range(len(texts))]`
			`docsearch = Chroma.from_texts(`
			`collection_name="test_collection",`
			`texts=texts,`
			`embedding=FakeEmbeddings(),`
			`metadatas=metadatas,`
			`)`
			`output = docsearch.similarity_search("foo", k=1)`
			`assert output == [Document(page_content="foo", metadata={"page": "0"})]`
Chroma persistence (#1028) This PR adds persistence to the Chroma vector store. Users can supply a `persist_directory` with any of the `Chroma` creation methods. If supplied, the store will be automatically persisted at that directory. If a user creates a new `Chroma` instance with the same persistence directory, it will get loaded up automatically. If they use `from_texts` or `from_documents` in this way, the documents will be loaded into the existing store. There is the chance of some funky behavior if the user passes a different embedding function from the one used to create the collection - we will make this easier in future updates. For now, we log a warning. 2023-02-14 05:09:06 +00:00

Harrison/similarity search chroma (#1434) Co-authored-by: shibuiwilliam <shibuiyusuke@gmail.com> 2023-03-04 16:10:15 +00:00			`def test_chroma_with_metadatas_with_scores() -> None:`
Propagate "filter" arg in Chroma similarity_search (#1869) Technically a duplicate fix to #1619 but with unit tests and a small documentation update - Propagate `filter` arg in Chroma `similarity_search` to delegated call to `similarity_search_with_score` - Add `filter` arg to `similarity_search_by_vector` - Clarify doc strings on FakeEmbeddings 2023-03-23 02:40:10 +00:00			`"""Test end to end construction and scored search."""`
Harrison/similarity search chroma (#1434) Co-authored-by: shibuiwilliam <shibuiyusuke@gmail.com> 2023-03-04 16:10:15 +00:00			`texts = ["foo", "bar", "baz"]`
			`metadatas = [{"page": str(i)} for i in range(len(texts))]`
			`docsearch = Chroma.from_texts(`
			`collection_name="test_collection",`
			`texts=texts,`
			`embedding=FakeEmbeddings(),`
			`metadatas=metadatas,`
			`)`
			`output = docsearch.similarity_search_with_score("foo", k=1)`
Propagate "filter" arg in Chroma similarity_search (#1869) Technically a duplicate fix to #1619 but with unit tests and a small documentation update - Propagate `filter` arg in Chroma `similarity_search` to delegated call to `similarity_search_with_score` - Add `filter` arg to `similarity_search_by_vector` - Clarify doc strings on FakeEmbeddings 2023-03-23 02:40:10 +00:00			`assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]`


			`def test_chroma_search_filter() -> None:`
			`"""Test end to end construction and search with metadata filtering."""`
			`texts = ["far", "bar", "baz"]`
			`metadatas = [{"first_letter": "{}".format(text[0])} for text in texts]`
			`docsearch = Chroma.from_texts(`
			`collection_name="test_collection",`
			`texts=texts,`
			`embedding=FakeEmbeddings(),`
			`metadatas=metadatas,`
			`)`
			`output = docsearch.similarity_search("far", k=1, filter={"first_letter": "f"})`
			`assert output == [Document(page_content="far", metadata={"first_letter": "f"})]`
			`output = docsearch.similarity_search("far", k=1, filter={"first_letter": "b"})`
			`assert output == [Document(page_content="bar", metadata={"first_letter": "b"})]`


			`def test_chroma_search_filter_with_scores() -> None:`
			`"""Test end to end construction and scored search with metadata filtering."""`
			`texts = ["far", "bar", "baz"]`
			`metadatas = [{"first_letter": "{}".format(text[0])} for text in texts]`
			`docsearch = Chroma.from_texts(`
			`collection_name="test_collection",`
			`texts=texts,`
			`embedding=FakeEmbeddings(),`
			`metadatas=metadatas,`
			`)`
			`output = docsearch.similarity_search_with_score(`
			`"far", k=1, filter={"first_letter": "f"}`
			`)`
			`assert output == [`
			`(Document(page_content="far", metadata={"first_letter": "f"}), 0.0)`
			`]`
			`output = docsearch.similarity_search_with_score(`
			`"far", k=1, filter={"first_letter": "b"}`
			`)`
			`assert output == [`
			`(Document(page_content="bar", metadata={"first_letter": "b"}), 1.0)`
			`]`
Harrison/similarity search chroma (#1434) Co-authored-by: shibuiwilliam <shibuiyusuke@gmail.com> 2023-03-04 16:10:15 +00:00

Chroma persistence (#1028) This PR adds persistence to the Chroma vector store. Users can supply a `persist_directory` with any of the `Chroma` creation methods. If supplied, the store will be automatically persisted at that directory. If a user creates a new `Chroma` instance with the same persistence directory, it will get loaded up automatically. If they use `from_texts` or `from_documents` in this way, the documents will be loaded into the existing store. There is the chance of some funky behavior if the user passes a different embedding function from the one used to create the collection - we will make this easier in future updates. For now, we log a warning. 2023-02-14 05:09:06 +00:00			`def test_chroma_with_persistence() -> None:`
			`"""Test end to end construction and search, with persistence."""`
			`chroma_persist_dir = "./tests/persist_dir"`
			`collection_name = "test_collection"`
			`texts = ["foo", "bar", "baz"]`
			`docsearch = Chroma.from_texts(`
			`collection_name=collection_name,`
			`texts=texts,`
			`embedding=FakeEmbeddings(),`
			`persist_directory=chroma_persist_dir,`
			`)`

			`output = docsearch.similarity_search("foo", k=1)`
			`assert output == [Document(page_content="foo")]`

			`docsearch.persist()`

			`# Get a new VectorStore from the persisted directory`
			`docsearch = Chroma(`
			`collection_name=collection_name,`
			`embedding_function=FakeEmbeddings(),`
			`persist_directory=chroma_persist_dir,`
			`)`
			`output = docsearch.similarity_search("foo", k=1)`

			`# Clean up`
			`docsearch.delete_collection()`

			`# Persist doesn't need to be called again`
			`# Data will be automatically persisted on object deletion`
			`# Or on program exit`
Chroma fix mmr (#3897) Fixes #3628, thanks @derekmoeller for the issue! 2023-05-01 17:47:15 +00:00

			`def test_chroma_mmr() -> None:`
			`"""Test end to end construction and search."""`
			`texts = ["foo", "bar", "baz"]`
			`docsearch = Chroma.from_texts(`
			`collection_name="test_collection", texts=texts, embedding=FakeEmbeddings()`
			`)`
			`output = docsearch.max_marginal_relevance_search("foo", k=1)`
			`assert output == [Document(page_content="foo")]`


			`def test_chroma_mmr_by_vector() -> None:`
			`"""Test end to end construction and search."""`
			`texts = ["foo", "bar", "baz"]`
			`embeddings = FakeEmbeddings()`
			`docsearch = Chroma.from_texts(`
			`collection_name="test_collection", texts=texts, embedding=embeddings`
			`)`
			`embedded_query = embeddings.embed_query("foo")`
			`output = docsearch.max_marginal_relevance_search_by_vector(embedded_query, k=1)`
			`assert output == [Document(page_content="foo")]`
Specify which data to return from chromadb (#4393) # Improve the Chroma get() method by adding the optional "include" parameter. The Chroma get() method excludes embeddings by default. You can customize the response by specifying the "include" parameter to selectively retrieve the desired data from the collection. --------- Co-authored-by: Dev 2049 <dev.dev2049@gmail.com> 2023-05-16 21:43:09 +00:00

			`def test_chroma_with_include_parameter() -> None:`
			`"""Test end to end construction and include parameter."""`
			`texts = ["foo", "bar", "baz"]`
			`docsearch = Chroma.from_texts(`
			`collection_name="test_collection", texts=texts, embedding=FakeEmbeddings()`
			`)`
			`output = docsearch.get(include=["embeddings"])`
			`assert output["embeddings"] is not None`
			`output = docsearch.get()`
			`assert output["embeddings"] is None`
Fix update_document function, add test and documentation. (#5359) # Fix for `update_document` Function in Chroma ## Summary This pull request addresses an issue with the `update_document` function in the Chroma class, as described in [#5031](https://github.com/hwchase17/langchain/issues/5031#issuecomment-1562577947). The issue was identified as an `AttributeError` raised when calling `update_document` due to a missing corresponding method in the `Collection` object. This fix refactors the `update_document` method in `Chroma` to correctly interact with the `Collection` object. ## Changes 1. Fixed the `update_document` method in the `Chroma` class to correctly call methods on the `Collection` object. 2. Added the corresponding test `test_chroma_update_document` in `tests/integration_tests/vectorstores/test_chroma.py` to reflect the updated method call. 3. Added an example and explanation of how to use the `update_document` function in the Jupyter notebook tutorial for Chroma. ## Test Plan All existing tests pass after this change. In addition, the `test_chroma_update_document` test case now correctly checks the functionality of `update_document`, ensuring that the function works as expected and updates the content of documents correctly. ## Reviewers @dev2049 This fix will ensure that users are able to use the `update_document` function as expected, without encountering the previous `AttributeError`. This will enhance the usability and reliability of the Chroma class for all users. Thank you for considering this pull request. I look forward to your feedback and suggestions. 2023-05-29 13:39:25 +00:00

			`def test_chroma_update_document() -> None:`
			`"""Test the update_document function in the Chroma class."""`

			`# Initial document content and id`
			`initial_content = "foo"`
			`document_id = "doc1"`

			`# Create an instance of Document with initial content and metadata`
			`original_doc = Document(page_content=initial_content, metadata={"page": "0"})`

			`# Initialize a Chroma instance with the original document`
			`docsearch = Chroma.from_documents(`
			`collection_name="test_collection",`
			`documents=[original_doc],`
			`embedding=FakeEmbeddings(),`
			`ids=[document_id],`
			`)`

			`# Define updated content for the document`
			`updated_content = "updated foo"`

			`# Create a new Document instance with the updated content and the same id`
			`updated_doc = Document(page_content=updated_content, metadata={"page": "0"})`

			`# Update the document in the Chroma instance`
			`docsearch.update_document(document_id=document_id, document=updated_doc)`

			`# Perform a similarity search with the updated content`
			`output = docsearch.similarity_search(updated_content, k=1)`

			`# Assert that the updated document is returned by the search`
			`assert output == [Document(page_content=updated_content, metadata={"page": "0"})]`