fix chroma update_document to embed entire documents, fixes a character-wise embedding bug (#5584)

# Chroma update_document full document embeddings bugfix

Chroma update_document takes a single document, but treats the
page_content string of that document as a list when getting the new
document embedding.

This is a two-fold problem: the resulting embedding for the updated
document is incorrect (it is only an embedding of the first character
of the new page_content), and the embedding function is called once for
every character in the new page_content string, using many tokens in
the process.

Fixes #5582


Co-authored-by: Caleb Ellington <calebellington@Calebs-MBP.hsd1.ca.comcast.net>
pull/5589/head^2
Caleb Ellington 1 year ago committed by GitHub
parent 3c6fa9126a
commit c5a7a85a4e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -356,11 +356,11 @@ class Chroma(VectorStore):
raise ValueError(
"For update, you must specify an embedding function on creation."
)
embeddings = self._embedding_function.embed_documents(list(text))
embeddings = self._embedding_function.embed_documents([text])
self._collection.update(
ids=[document_id],
embeddings=[embeddings[0]],
embeddings=embeddings,
documents=[text],
metadatas=[metadata],
)

@ -3,7 +3,10 @@ import pytest
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
from tests.integration_tests.vectorstores.fake_embeddings import (
ConsistentFakeEmbeddings,
FakeEmbeddings,
)
def test_chroma() -> None:
@ -164,6 +167,8 @@ def test_chroma_with_include_parameter() -> None:
def test_chroma_update_document() -> None:
"""Test the update_document function in the Chroma class."""
# Make a consistent embedding
embedding = ConsistentFakeEmbeddings()
# Initial document content and id
initial_content = "foo"
@ -176,9 +181,12 @@ def test_chroma_update_document() -> None:
docsearch = Chroma.from_documents(
collection_name="test_collection",
documents=[original_doc],
embedding=FakeEmbeddings(),
embedding=embedding,
ids=[document_id],
)
old_embedding = docsearch._collection.peek()["embeddings"][
docsearch._collection.peek()["ids"].index(document_id)
]
# Define updated content for the document
updated_content = "updated foo"
@ -194,3 +202,10 @@ def test_chroma_update_document() -> None:
# Assert that the updated document is returned by the search
assert output == [Document(page_content=updated_content, metadata={"page": "0"})]
# Assert that the new embedding is correct
new_embedding = docsearch._collection.peek()["embeddings"][
docsearch._collection.peek()["ids"].index(document_id)
]
assert new_embedding == embedding.embed_documents([updated_content])[0]
assert new_embedding != old_embedding

Loading…
Cancel
Save