From f907b625262c1236de68ae524d1feac406348749 Mon Sep 17 00:00:00 2001 From: berkedilekoglu Date: Tue, 6 Jun 2023 06:39:49 +0300 Subject: [PATCH] Scores are explained in vectorestore docs (#5613) # Scores in Vectorstores' Docs Are Explained The following vectorstores can return scores along with similar documents by using `similarity_search_with_score`: - chroma - docarray_hnsw - docarray_in_memory - faiss - myscale - qdrant - supabase - vectara - weaviate However, in the documentation, these scores were either not explained at all or explained in a way that could lead to misunderstandings (e.g., FAISS). For instance, in the FAISS documentation: if we consider the score returned by the function as a similarity score, we understand that a document returning a higher score is more similar to the source document. However, since the scores returned by the function are distance scores, we should understand that smaller scores correspond to more similar documents. For the libraries other than Vectara, I wrote the scores they use by investigating the source libraries. Since I couldn't be certain about the score metric used by Vectara, I didn't make any changes in its documentation. The links mentioned in Vectara's documentation became broken due to updates, so I replaced them with working ones. 
VectorStores / Retrievers / Memory - @dev2049 my twitter: [berkedilekoglu](https://twitter.com/berkedilekoglu) --------- Co-authored-by: Harrison Chase --- .../vectorstores/examples/chroma.ipynb | 9 +++++++ .../vectorstores/examples/docarray_hnsw.ipynb | 14 +++++++++++ .../examples/docarray_in_memory.ipynb | 14 +++++++++++ .../indexes/vectorstores/examples/faiss.ipynb | 10 ++++++-- .../vectorstores/examples/myscale.ipynb | 25 +++++++++++++++++++ .../vectorstores/examples/qdrant.ipynb | 20 +++++++++++++-- .../vectorstores/examples/supabase.ipynb | 14 +++++++++++ .../vectorstores/examples/vectara.ipynb | 10 ++++++-- .../vectorstores/examples/weaviate.ipynb | 14 +++++++++++ langchain/vectorstores/chroma.py | 5 ++-- langchain/vectorstores/docarray/base.py | 4 ++- langchain/vectorstores/faiss.py | 6 +++-- langchain/vectorstores/myscale.py | 4 ++- langchain/vectorstores/qdrant.py | 4 ++- langchain/vectorstores/weaviate.py | 5 ++++ 15 files changed, 145 insertions(+), 13 deletions(-) diff --git a/docs/modules/indexes/vectorstores/examples/chroma.ipynb b/docs/modules/indexes/vectorstores/examples/chroma.ipynb index 70416c02..d0e09157 100644 --- a/docs/modules/indexes/vectorstores/examples/chroma.ipynb +++ b/docs/modules/indexes/vectorstores/examples/chroma.ipynb @@ -151,6 +151,15 @@ "## Similarity search with score" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "346347d7", + "metadata": {}, + "source": [ + "The returned distance score is cosine distance. Therefore, a lower score is better." 
+ ] + }, { "cell_type": "code", "execution_count": 10, diff --git a/docs/modules/indexes/vectorstores/examples/docarray_hnsw.ipynb b/docs/modules/indexes/vectorstores/examples/docarray_hnsw.ipynb index 94f6b952..8221d133 100644 --- a/docs/modules/indexes/vectorstores/examples/docarray_hnsw.ipynb +++ b/docs/modules/indexes/vectorstores/examples/docarray_hnsw.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "2ce41f46-5711-4311-b04d-2fe233ac5b1b", "metadata": {}, @@ -13,6 +14,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7ee37d28", "metadata": {}, @@ -55,6 +57,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "8dbb6de2", "metadata": { @@ -98,6 +101,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "ed6f905b-4853-4a44-9730-614aa8e22b78", "metadata": {}, @@ -145,6 +149,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "3febb987-e903-416f-af26-6897d84c8d61", "metadata": {}, @@ -152,6 +157,15 @@ "### Similarity search with score" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "bb1df11a", + "metadata": {}, + "source": [ + "The returned distance score is cosine distance. Therefore, a lower score is better." 
+ ] + }, { "cell_type": "code", "execution_count": 7, diff --git a/docs/modules/indexes/vectorstores/examples/docarray_in_memory.ipynb b/docs/modules/indexes/vectorstores/examples/docarray_in_memory.ipynb index 306439ea..12919c3b 100644 --- a/docs/modules/indexes/vectorstores/examples/docarray_in_memory.ipynb +++ b/docs/modules/indexes/vectorstores/examples/docarray_in_memory.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "a3afefb0-7e99-4912-a222-c6b186da11af", "metadata": {}, @@ -13,6 +14,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "5031a3ec", "metadata": {}, @@ -54,6 +56,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "6e57a389-f637-4b8f-9ab2-759ae7485f78", "metadata": {}, @@ -95,6 +98,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "efbb6684-3846-4332-a624-ddd4d75844c1", "metadata": {}, @@ -142,6 +146,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "43896697-f99e-47b6-9117-47a25e9afa9c", "metadata": {}, @@ -149,6 +154,15 @@ "### Similarity search with score" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "414a9bc9", + "metadata": {}, + "source": [ + "The returned distance score is cosine distance. Therefore, a lower score is better." 
+ ] + }, { "cell_type": "code", "execution_count": 7, diff --git a/docs/modules/indexes/vectorstores/examples/faiss.ipynb b/docs/modules/indexes/vectorstores/examples/faiss.ipynb index 78000c2c..d967068e 100644 --- a/docs/modules/indexes/vectorstores/examples/faiss.ipynb +++ b/docs/modules/indexes/vectorstores/examples/faiss.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "683953b3", "metadata": {}, @@ -29,6 +30,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "38237514-b3fa-44a4-9cff-30cd6bf50073", "metadata": {}, @@ -45,7 +47,7 @@ }, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ "OpenAI API Key: ········\n" @@ -137,12 +139,13 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f13473b5", "metadata": {}, "source": [ "## Similarity Search with score\n", - "There are some FAISS specific methods. One of them is `similarity_search_with_score`, which allows you to return not only the documents but also the similarity score of the query to them." + "There are some FAISS specific methods. One of them is `similarity_search_with_score`, which allows you to return not only the documents but also the distance score of the query to them. The returned distance score is L2 distance. Therefore, a lower score is better." 
] }, { @@ -178,6 +181,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f34420cf", "metadata": {}, @@ -197,6 +201,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "31bda7fd", "metadata": {}, @@ -257,6 +262,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "57da60d4", "metadata": {}, diff --git a/docs/modules/indexes/vectorstores/examples/myscale.ipynb b/docs/modules/indexes/vectorstores/examples/myscale.ipynb index 12505987..15ae2062 100644 --- a/docs/modules/indexes/vectorstores/examples/myscale.ipynb +++ b/docs/modules/indexes/vectorstores/examples/myscale.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "683953b3", "metadata": {}, @@ -13,6 +14,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "43ead5d5-2c1f-4dce-a69a-cb00e4f9d6f0", "metadata": {}, @@ -33,6 +35,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "15a1d477-9cdb-4d82-b019-96951ecb2b72", "metadata": {}, @@ -54,6 +57,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a9d16fa3", "metadata": {}, @@ -169,6 +173,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "e3a8b105", "metadata": {}, @@ -187,6 +192,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "f59360c0", "metadata": {}, @@ -231,6 +237,24 @@ "docsearch = MyScale.from_documents(docs, embeddings)" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8d867b05", + "metadata": {}, + "source": [ + "### Similarity search with score" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9ec25cc5", + "metadata": {}, + "source": [ + "The returned distance score is cosine distance. Therefore, a lower score is better." 
+ ] + }, { "cell_type": "code", "execution_count": 16, @@ -257,6 +281,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a359ed74", "metadata": {}, diff --git a/docs/modules/indexes/vectorstores/examples/qdrant.ipynb b/docs/modules/indexes/vectorstores/examples/qdrant.ipynb index 526d08a9..c53c46e9 100644 --- a/docs/modules/indexes/vectorstores/examples/qdrant.ipynb +++ b/docs/modules/indexes/vectorstores/examples/qdrant.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "683953b3", "metadata": {}, @@ -33,6 +34,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "7b2f111b-357a-4f42-9730-ef0603bdc1b5", "metadata": {}, @@ -49,7 +51,7 @@ }, "outputs": [ { - "name": "stdin", + "name": "stdout", "output_type": "stream", "text": [ "OpenAI API Key: ········\n" @@ -104,6 +106,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "eeead681", "metadata": {}, @@ -140,6 +143,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "59f0b954", "metadata": {}, @@ -170,6 +174,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "749658ce", "metadata": {}, @@ -200,6 +205,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "c9e21ce9", "metadata": {}, @@ -231,6 +237,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "93540013", "metadata": {}, @@ -279,6 +286,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "1f9215c8", "metadata": { @@ -341,13 +349,15 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "1bda9bf5", "metadata": {}, "source": [ "## Similarity search with score\n", "\n", - "Sometimes we might want to perform the search, but also obtain a relevancy score to know how good is a particular result." + "Sometimes we might want to perform the search, but also obtain a relevancy score to know how good is a particular result. \n", + "The returned distance score is cosine distance. Therefore, a lower score is better." 
] }, { @@ -400,6 +410,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "525e3582", "metadata": {}, @@ -410,6 +421,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "1c2c58dc", "metadata": {}, @@ -423,6 +435,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "c58c30bf", "metadata": { @@ -503,6 +516,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "691a82d6", "metadata": {}, @@ -540,6 +554,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0c851b4f", "metadata": {}, @@ -602,6 +617,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "0358ecde", "metadata": {}, diff --git a/docs/modules/indexes/vectorstores/examples/supabase.ipynb b/docs/modules/indexes/vectorstores/examples/supabase.ipynb index 5cb264ff..b653df69 100644 --- a/docs/modules/indexes/vectorstores/examples/supabase.ipynb +++ b/docs/modules/indexes/vectorstores/examples/supabase.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "683953b3", "metadata": {}, @@ -9,6 +10,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "cc80fa84-1f2f-48b4-bd39-3e6412f012f1", "metadata": {}, @@ -85,6 +87,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "69bff365-3039-4ff8-a641-aa190166179d", "metadata": {}, @@ -236,6 +239,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "18152965", "metadata": {}, @@ -243,6 +247,15 @@ "## Similarity search with score\n" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ea13e80a", + "metadata": {}, + "source": [ + "The returned distance score is cosine distance. Therefore, a lower score is better." 
+ ] }, { "cell_type": "code", "execution_count": 9, @@ -276,6 +289,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "794a7552", "metadata": {}, diff --git a/docs/modules/indexes/vectorstores/examples/vectara.ipynb b/docs/modules/indexes/vectorstores/examples/vectara.ipynb index 5b2dad08..6551f55c 100644 --- a/docs/modules/indexes/vectorstores/examples/vectara.ipynb +++ b/docs/modules/indexes/vectorstores/examples/vectara.ipynb @@ -1,21 +1,23 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "683953b3", "metadata": {}, "source": [ "# Vectara\n", "\n", - ">[Vectara](https://Vectara.com/docs/) is a API platform for building LLM-powered applications. It provides a simple to use API for document indexing and query that is managed by Vectara and is optimized for performance and accuracy. \n", + ">[Vectara](https://vectara.com/) is an API platform for building LLM-powered applications. It provides a simple-to-use API for document indexing and query that is managed by Vectara and is optimized for performance and accuracy. \n", "\n", "\n", "This notebook shows how to use functionality related to the `Vectara` vector database. \n", "\n", - "See the [Vectara API documentation ](https://Vectara.com/docs/) for more information on how to use the API." + "See the [Vectara API documentation](https://docs.vectara.com/docs/) for more information on how to use the API." 
] }, { + "attachments": {}, "cell_type": "markdown", "id": "7b2f111b-357a-4f42-9730-ef0603bdc1b5", "metadata": {}, @@ -87,6 +89,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "eeead681", "metadata": {}, @@ -113,6 +116,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "1f9215c8", "metadata": { @@ -169,6 +173,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "1bda9bf5", "metadata": {}, @@ -222,6 +227,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "691a82d6", "metadata": {}, diff --git a/docs/modules/indexes/vectorstores/examples/weaviate.ipynb b/docs/modules/indexes/vectorstores/examples/weaviate.ipynb index e2494e38..2b151716 100644 --- a/docs/modules/indexes/vectorstores/examples/weaviate.ipynb +++ b/docs/modules/indexes/vectorstores/examples/weaviate.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "683953b3", "metadata": {}, @@ -47,6 +48,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "6b34828d-e627-4d85-aabd-eeb15d9f4b00", "metadata": {}, @@ -165,6 +167,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "a15863ee", "metadata": {}, @@ -172,6 +175,16 @@ "## Similarity search with score" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "64e03db8", + "metadata": {}, + "source": [ + "Sometimes we might want to perform the search, but also obtain a relevancy score to know how good is a particular result. \n", + "The returned distance score is cosine distance. Therefore, a lower score is better." 
+ ] + }, { "cell_type": "code", "execution_count": 10, @@ -214,6 +227,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "05fd146c", "metadata": {}, diff --git a/langchain/vectorstores/chroma.py b/langchain/vectorstores/chroma.py index 3c8e8239..6b9be25e 100644 --- a/langchain/vectorstores/chroma.py +++ b/langchain/vectorstores/chroma.py @@ -217,8 +217,9 @@ class Chroma(VectorStore): filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. Returns: - List[Tuple[Document, float]]: List of documents most similar to the query - text with distance in float. + List[Tuple[Document, float]]: List of documents most similar to + the query text and cosine distance in float for each. + Lower score represents more similarity. """ if self._embedding_function is None: results = self.__query_collection( diff --git a/langchain/vectorstores/docarray/base.py b/langchain/vectorstores/docarray/base.py index d8b0f4b5..18b4a01a 100644 --- a/langchain/vectorstores/docarray/base.py +++ b/langchain/vectorstores/docarray/base.py @@ -96,7 +96,9 @@ class DocArrayIndex(VectorStore, ABC): k: Number of Documents to return. Defaults to 4. Returns: - List of Documents most similar to the query and score for each. + List of documents most similar to the query text and + cosine distance in float for each. + Lower score represents more similarity. """ query_embedding = self.embedding.embed_query(query) query_doc = self.doc_cls(embedding=query_embedding) # type: ignore diff --git a/langchain/vectorstores/faiss.py b/langchain/vectorstores/faiss.py index 26266e04..42f9cf1b 100644 --- a/langchain/vectorstores/faiss.py +++ b/langchain/vectorstores/faiss.py @@ -189,7 +189,8 @@ class FAISS(VectorStore): k: Number of Documents to return. Defaults to 4. Returns: - List of Documents most similar to the query and score for each + List of documents most similar to the query text and L2 distance + in float for each. Lower score represents more similarity. 
""" faiss = dependable_faiss_import() vector = np.array([embedding], dtype=np.float32) @@ -218,7 +219,8 @@ class FAISS(VectorStore): k: Number of Documents to return. Defaults to 4. Returns: - List of Documents most similar to the query and score for each + List of documents most similar to the query text with + L2 distance in float. Lower score represents more similarity. """ embedding = self.embedding_function(query) docs = self.similarity_search_with_score_by_vector(embedding, k) diff --git a/langchain/vectorstores/myscale.py b/langchain/vectorstores/myscale.py index cc43aa7e..fbea41ae 100644 --- a/langchain/vectorstores/myscale.py +++ b/langchain/vectorstores/myscale.py @@ -404,7 +404,9 @@ class MyScale(VectorStore): alone. The default name for it is `metadata`. Returns: - List[Document]: List of documents + List[Document]: List of documents most similar to the query text + and cosine distance in float for each. + Lower score represents more similarity. """ q_str = self._build_qstr(self.embedding_function(query), k, where_str) try: diff --git a/langchain/vectorstores/qdrant.py b/langchain/vectorstores/qdrant.py index 9c5f2f7a..f1f16137 100644 --- a/langchain/vectorstores/qdrant.py +++ b/langchain/vectorstores/qdrant.py @@ -192,7 +192,9 @@ class Qdrant(VectorStore): filter: Filter by metadata. Defaults to None. Returns: - List of Documents most similar to the query and score for each. + List of documents most similar to the query text and cosine + distance in float for each. + Lower score represents more similarity. 
""" if filter is not None and isinstance(filter, dict): diff --git a/langchain/vectorstores/weaviate.py b/langchain/vectorstores/weaviate.py index 1ff98f3b..43501452 100644 --- a/langchain/vectorstores/weaviate.py +++ b/langchain/vectorstores/weaviate.py @@ -314,6 +314,11 @@ class Weaviate(VectorStore): def similarity_search_with_score( self, query: str, k: int = 4, **kwargs: Any ) -> List[Tuple[Document, float]]: + """ + Return list of documents most similar to the query + text and cosine distance in float for each. + Lower score represents more similarity. + """ if self._embedding is None: raise ValueError( "_embedding cannot be None for similarity_search_with_score"