From a1b9dfc099f7b468854aeee54dd5b0358e9bf25e Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sat, 4 Mar 2023 08:10:15 -0800 Subject: [PATCH] Harrison/similarity search chroma (#1434) Co-authored-by: shibuiwilliam --- .../indexes/vectorstore_examples/chroma.ipynb | 40 +++++++++++++++++++ langchain/vectorstores/chroma.py | 31 ++++++++++++-- .../vectorstores/test_chroma.py | 14 +++++++ 3 files changed, 82 insertions(+), 3 deletions(-) diff --git a/docs/modules/indexes/vectorstore_examples/chroma.ipynb b/docs/modules/indexes/vectorstore_examples/chroma.ipynb index ce9610a490..2759548d4d 100644 --- a/docs/modules/indexes/vectorstore_examples/chroma.ipynb +++ b/docs/modules/indexes/vectorstore_examples/chroma.ipynb @@ -89,6 +89,46 @@ "print(docs[0].page_content)" ] }, + { + "cell_type": "markdown", + "id": "18152965", + "metadata": {}, + "source": [ + "## Similarity search with score" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "72aaa9c8", + "metadata": {}, + "outputs": [], + "source": [ + "docs = db.similarity_search_with_score(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d88e958e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(Document(page_content='In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. \\n\\nWe cannot let this happen. \\n\\nTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', lookup_str='', metadata={'source': '../../state_of_the_union.txt'}, lookup_index=0),\n", + " 0.3913410007953644)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, { "cell_type": "markdown", "id": "8061454b", diff --git a/langchain/vectorstores/chroma.py b/langchain/vectorstores/chroma.py index 8c1f39385e..7679b22eaf 100644 --- a/langchain/vectorstores/chroma.py +++ b/langchain/vectorstores/chroma.py @@ -3,7 +3,7 @@ from __future__ import annotations import logging import uuid -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Tuple from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings @@ -116,6 +116,27 @@ class Chroma(VectorStore): Returns: List[Document]: List of documents most simmilar to the query text. """ + docs_and_scores = self.similarity_search_with_score(query, k) + return [doc for doc, _ in docs_and_scores] + + def similarity_search_with_score( + self, + query: str, + k: int = 4, + filter: Optional[Dict[str, str]] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Run similarity search with Chroma with distance. + + Args: + query (str): Query text to search for. + k (int): Number of results to return. Defaults to 4. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + + Returns: + List[Tuple[Document, float]]: List of documents most similar to the query + text with distance in float. + """ if self._embedding_function is None: results = self._collection.query( query_texts=[query], n_results=k, where=filter @@ -129,8 +150,12 @@ class Chroma(VectorStore): docs = [ # TODO: Chroma can do batch querying, # we shouldn't hard code to the 1st result - Document(page_content=result[0], metadata=result[1]) - for result in zip(results["documents"][0], results["metadatas"][0]) + (Document(page_content=result[0], metadata=result[1]), result[2]) + for result in zip( + results["documents"][0], + results["metadatas"][0], + results["distances"][0], + ) ] return docs diff --git a/tests/integration_tests/vectorstores/test_chroma.py b/tests/integration_tests/vectorstores/test_chroma.py index f1d488b51c..e558a0741d 100644 --- a/tests/integration_tests/vectorstores/test_chroma.py +++ b/tests/integration_tests/vectorstores/test_chroma.py @@ -28,6 +28,20 @@ def test_chroma_with_metadatas() -> None: assert output == [Document(page_content="foo", metadata={"page": "0"})] +def test_chroma_with_metadatas_with_scores() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = Chroma.from_texts( + collection_name="test_collection", + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + ) + output = docsearch.similarity_search_with_score("foo", k=1) + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)] + + def test_chroma_with_persistence() -> None: """Test end to end construction and search, with persistence.""" chroma_persist_dir = "./tests/persist_dir"