diff --git a/docs/modules/indexes/vectorstores/examples/chroma.ipynb b/docs/modules/indexes/vectorstores/examples/chroma.ipynb index 5c24f2a0..70416c02 100644 --- a/docs/modules/indexes/vectorstores/examples/chroma.ipynb +++ b/docs/modules/indexes/vectorstores/examples/chroma.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "683953b3", "metadata": {}, @@ -33,7 +34,7 @@ }, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ " ········\n" @@ -86,7 +87,6 @@ }, "outputs": [], "source": [ - "from langchain.document_loaders import TextLoader\n", "loader = TextLoader('../../../state_of_the_union.txt')\n", "documents = loader.load()\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", @@ -143,6 +143,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "18152965", "metadata": {}, @@ -187,6 +188,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8061454b", "metadata": {}, @@ -197,6 +199,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2b76db26", "metadata": {}, @@ -232,6 +235,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f568a322", "metadata": {}, @@ -262,6 +266,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cc9ed900", "metadata": {}, @@ -292,6 +297,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "794a7552", "metadata": {}, @@ -336,13 +342,81 @@ "retriever.get_relevant_documents(query)[0]" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2a877f08", + "metadata": {}, + "source": [ + "## Updating a Document\n", + "The `update_document` function allows you to modify the content of a document in the Chroma instance after it has been added. Let's see an example of how to use this function." 
+ ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "a559c3f1", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Import Document class\n", + "from langchain.docstore.document import Document\n", + "\n", + "# Initial document content and id\n", + "initial_content = \"This is an initial document content\"\n", + "document_id = \"doc1\"\n", + "\n", + "# Create an instance of Document with initial content and metadata\n", + "original_doc = Document(page_content=initial_content, metadata={\"page\": \"0\"})\n", + "\n", + "# Initialize a Chroma instance with the original document\n", + "new_db = Chroma.from_documents(\n", + " collection_name=\"test_collection\",\n", + " documents=[original_doc],\n", + " embedding=OpenAIEmbeddings(), # using the same embeddings as before\n", + " ids=[document_id],\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "60a7c273", + "metadata": {}, + "source": [ + "At this point, we have a new Chroma instance with a single document \"This is an initial document content\" with id \"doc1\". Now, let's update the content of the document." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "55e48056", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is the updated document content {'page': '1'}\n" + ] + } + ], + "source": [ + "# Updated document content\n", + "updated_content = \"This is the updated document content\"\n", + "\n", + "# Create a new Document instance with the updated content\n", + "updated_doc = Document(page_content=updated_content, metadata={\"page\": \"1\"})\n", + "\n", + "# Update the document in the Chroma instance by passing the document id and the updated document\n", + "new_db.update_document(document_id=document_id, document=updated_doc)\n", + "\n", + "# Now, let's retrieve the updated document using similarity search\n", + "output = new_db.similarity_search(updated_content, k=1)\n", + "\n", + "# Print the content of the retrieved document\n", + "print(output[0].page_content, output[0].metadata)" + ] } ], "metadata": { diff --git a/langchain/vectorstores/chroma.py b/langchain/vectorstores/chroma.py index f83c8c22..b10a81ed 100644 --- a/langchain/vectorstores/chroma.py +++ b/langchain/vectorstores/chroma.py @@ -348,7 +348,18 @@ class Chroma(VectorStore): """ text = document.page_content metadata = document.metadata - self._collection.update_document(document_id, text, metadata) + if self._embedding_function is None: + raise ValueError( + "For update, you must specify an embedding function on creation." 
+            )
+        embeddings = self._embedding_function.embed_documents([text])
+
+        self._collection.update(
+            ids=[document_id],
+            embeddings=[embeddings[0]],
+            documents=[text],
+            metadatas=[metadata],
+        )
 
     @classmethod
     def from_texts(
diff --git a/tests/integration_tests/vectorstores/test_chroma.py b/tests/integration_tests/vectorstores/test_chroma.py
index 17fb781e..cc594d2e 100644
--- a/tests/integration_tests/vectorstores/test_chroma.py
+++ b/tests/integration_tests/vectorstores/test_chroma.py
@@ -160,3 +160,37 @@ def test_chroma_with_include_parameter() -> None:
     assert output["embeddings"] is not None
     output = docsearch.get()
     assert output["embeddings"] is None
+
+
+def test_chroma_update_document() -> None:
+    """Test the update_document function in the Chroma class."""
+
+    # Initial document content and id
+    initial_content = "foo"
+    document_id = "doc1"
+
+    # Create an instance of Document with initial content and metadata
+    original_doc = Document(page_content=initial_content, metadata={"page": "0"})
+
+    # Initialize a Chroma instance with the original document
+    docsearch = Chroma.from_documents(
+        collection_name="test_collection",
+        documents=[original_doc],
+        embedding=FakeEmbeddings(),
+        ids=[document_id],
+    )
+
+    # Define updated content for the document
+    updated_content = "updated foo"
+
+    # Create a new Document instance with the updated content and the same id
+    updated_doc = Document(page_content=updated_content, metadata={"page": "0"})
+
+    # Update the document in the Chroma instance
+    docsearch.update_document(document_id=document_id, document=updated_doc)
+
+    # Perform a similarity search with the updated content
+    output = docsearch.similarity_search(updated_content, k=1)
+
+    # Assert that the updated document is returned by the search
+    assert output == [Document(page_content=updated_content, metadata={"page": "0"})]