fix chroma update_document to embed entire documents, fixes a characer-wise embedding bug (#5584)

# Chroma update_document full document embeddings bugfix Chroma update_document takes a single document, but treats the page_content sting of that document as a list when getting the new document embedding. This is a two-fold problem, where the resulting embedding for the updated document is incorrect (it's only an embedding of the first character in the new page_content) and it calls the embedding function for every character in the new page_content string, using many tokens in the process. Fixes #5582 Co-authored-by: Caleb Ellington <calebellington@Calebs-MBP.hsd1.ca.comcast.net>
1 year ago · c5a7a85a4e
parent 3c6fa9126a
commit c5a7a85a4e
2 changed files with 19 additions and 4 deletions
--- a/langchain/vectorstores/chroma.py
+++ b/langchain/vectorstores/chroma.py
@ -356,11 +356,11 @@ class Chroma(VectorStore):
            raise ValueError(
                "For update, you must specify an embedding function on creation."
            )
-        embeddings = self._embedding_function.embed_documents(list(text))
+        embeddings = self._embedding_function.embed_documents([text])

        self._collection.update(
            ids=[document_id],
-            embeddings=[embeddings[0]],
+            embeddings=embeddings,
            documents=[text],
            metadatas=[metadata],
        )
--- a/tests/integration_tests/vectorstores/test_chroma.py
+++ b/tests/integration_tests/vectorstores/test_chroma.py
@ -3,7 +3,10 @@ import pytest

 from langchain.docstore.document import Document
 from langchain.vectorstores import Chroma
-from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
+from tests.integration_tests.vectorstores.fake_embeddings import (
+    ConsistentFakeEmbeddings,
+    FakeEmbeddings,
+)


 def test_chroma() -> None:
@ -164,6 +167,8 @@ def test_chroma_with_include_parameter() -> None:

 def test_chroma_update_document() -> None:
    """Test the update_document function in the Chroma class."""
+    # Make a consistent embedding
+    embedding = ConsistentFakeEmbeddings()

    # Initial document content and id
    initial_content = "foo"
@ -176,9 +181,12 @@ def test_chroma_update_document() -> None:
    docsearch = Chroma.from_documents(
        collection_name="test_collection",
        documents=[original_doc],
-        embedding=FakeEmbeddings(),
+        embedding=embedding,
        ids=[document_id],
    )
+    old_embedding = docsearch._collection.peek()["embeddings"][
+        docsearch._collection.peek()["ids"].index(document_id)
+    ]

    # Define updated content for the document
    updated_content = "updated foo"
@ -194,3 +202,10 @@ def test_chroma_update_document() -> None:

    # Assert that the updated document is returned by the search
    assert output == [Document(page_content=updated_content, metadata={"page": "0"})]
+
+    # Assert that the new embedding is correct
+    new_embedding = docsearch._collection.peek()["embeddings"][
+        docsearch._collection.peek()["ids"].index(document_id)
+    ]
+    assert new_embedding == embedding.embed_documents([updated_content])[0]
+    assert new_embedding != old_embedding