From eef62bf4e9d5b5115f5480d2356ab6341b55311e Mon Sep 17 00:00:00 2001 From: Slawomir Gonet Date: Sat, 17 Jun 2023 18:44:28 +0200 Subject: [PATCH] qdrant: search by vector (#6043) Added support for `search_by_vector` to the Qdrant vector store. ### Who can review VectorStores / Retrievers / Memory - @dev2049 --- langchain/vectorstores/qdrant.py | 114 +++++++++++++++++- .../vectorstores/test_qdrant.py | 59 +++++++++ 2 files changed, 172 insertions(+), 1 deletion(-) diff --git a/langchain/vectorstores/qdrant.py b/langchain/vectorstores/qdrant.py index 9acc99fa..7a7061fe 100644 --- a/langchain/vectorstores/qdrant.py +++ b/langchain/vectorstores/qdrant.py @@ -256,6 +256,118 @@ class Qdrant(VectorStore): all of them - 'all' - query all replicas, and return values present in all replicas + Returns: + List of documents most similar to the query text and cosine + distance in float for each. + Lower score represents more similarity. + """ + return self.similarity_search_with_score_by_vector( + self._embed_query(query), + k, + filter=filter, + search_params=search_params, + offset=offset, + score_threshold=score_threshold, + consistency=consistency, + **kwargs, + ) + + def similarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[MetadataFilter] = None, + search_params: Optional[common_types.SearchParams] = None, + offset: int = 0, + score_threshold: Optional[float] = None, + consistency: Optional[common_types.ReadConsistency] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding vector to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: Filter by metadata. Defaults to None. + search_params: Additional search params + offset: + Offset of the first result to return. + May be used to paginate results. + Note: large offset values may cause performance issues. + score_threshold: + Define a minimal score threshold for the result. 
+ If defined, less similar results will not be returned. + Score of the returned result might be higher or smaller than the + threshold depending on the Distance function used. + E.g. for cosine similarity only higher scores will be returned. + consistency: + Read consistency of the search. Defines how many replicas should be + queried before returning the result. + Values: + - int - number of replicas to query, values should be present in + all queried replicas + - 'majority' - query all replicas, but return values present in the + majority of replicas + - 'quorum' - query the majority of replicas, return values present in + all of them + - 'all' - query all replicas, and return values present in all replicas + + Returns: + List of Documents most similar to the query. + """ + + results = self.similarity_search_with_score_by_vector( + embedding, + k, + filter=filter, + search_params=search_params, + offset=offset, + score_threshold=score_threshold, + consistency=consistency, + **kwargs, + ) + return list(map(itemgetter(0), results)) + + def similarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[MetadataFilter] = None, + search_params: Optional[common_types.SearchParams] = None, + offset: int = 0, + score_threshold: Optional[float] = None, + consistency: Optional[common_types.ReadConsistency] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding vector to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: Filter by metadata. Defaults to None. + search_params: Additional search params + offset: + Offset of the first result to return. + May be used to paginate results. + Note: large offset values may cause performance issues. + score_threshold: + Define a minimal score threshold for the result. + If defined, less similar results will not be returned. 
+ Score of the returned result might be higher or smaller than the + threshold depending on the Distance function used. + E.g. for cosine similarity only higher scores will be returned. + consistency: + Read consistency of the search. Defines how many replicas should be + queried before returning the result. + Values: + - int - number of replicas to query, values should be present in + all queried replicas + - 'majority' - query all replicas, but return values present in the + majority of replicas + - 'quorum' - query the majority of replicas, return values present in + all of them + - 'all' - query all replicas, and return values present in all replicas + Returns: List of documents most similar to the query text and cosine distance in float for each. @@ -274,7 +386,7 @@ class Qdrant(VectorStore): qdrant_filter = filter results = self.client.search( collection_name=self.collection_name, - query_vector=self._embed_query(query), + query_vector=embedding, query_filter=qdrant_filter, search_params=search_params, limit=k, diff --git a/tests/integration_tests/vectorstores/test_qdrant.py b/tests/integration_tests/vectorstores/test_qdrant.py index aec77cd0..1211f865 100644 --- a/tests/integration_tests/vectorstores/test_qdrant.py +++ b/tests/integration_tests/vectorstores/test_qdrant.py @@ -40,6 +40,65 @@ def test_qdrant_similarity_search( assert output == [Document(page_content="foo")] +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize( + ["content_payload_key", "metadata_payload_key"], + [ + (Qdrant.CONTENT_KEY, Qdrant.METADATA_KEY), + ("foo", "bar"), + (Qdrant.CONTENT_KEY, "bar"), + ("foo", Qdrant.METADATA_KEY), + ], +) +def test_qdrant_similarity_search_by_vector( + batch_size: int, content_payload_key: str, metadata_payload_key: str +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=":memory:", 
content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + ) + embeddings = ConsistentFakeEmbeddings().embed_query("foo") + output = docsearch.similarity_search_by_vector(embeddings, k=1) + assert output == [Document(page_content="foo")] + + +@pytest.mark.parametrize("batch_size", [1, 64]) +@pytest.mark.parametrize( + ["content_payload_key", "metadata_payload_key"], + [ + (Qdrant.CONTENT_KEY, Qdrant.METADATA_KEY), + ("foo", "bar"), + (Qdrant.CONTENT_KEY, "bar"), + ("foo", Qdrant.METADATA_KEY), + ], +) +def test_qdrant_similarity_search_with_score_by_vector( + batch_size: int, content_payload_key: str, metadata_payload_key: str +) -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = Qdrant.from_texts( + texts, + ConsistentFakeEmbeddings(), + location=":memory:", + content_payload_key=content_payload_key, + metadata_payload_key=metadata_payload_key, + batch_size=batch_size, + ) + embeddings = ConsistentFakeEmbeddings().embed_query("foo") + output = docsearch.similarity_search_with_score_by_vector(embeddings, k=1) + assert len(output) == 1 + document, score = output[0] + assert document == Document(page_content="foo") + assert score >= 0 + + @pytest.mark.parametrize("batch_size", [1, 64]) def test_qdrant_add_documents(batch_size: int) -> None: """Test end to end construction and search."""