qdrant: search by vector (#6043)

Added support for `search_by_vector` to the Qdrant vector store.  ### Who can review VectorStores / Retrievers / Memory - @dev2049
2023-06-17 18:44:28 +02:00 · 2023-06-17 18:44:28 +02:00 · eef62bf4e9
commit eef62bf4e9
parent b7ba7e8a7b
2 changed files with 172 additions and 1 deletions
--- a/langchain/vectorstores/qdrant.py
+++ b/langchain/vectorstores/qdrant.py
@ -256,6 +256,118 @@ class Qdrant(VectorStore):
                             all of them
                - 'all' - query all replicas, and return values present in all replicas

+        Returns:
+            List of documents most similar to the query text and cosine
+            distance in float for each.
+            Lower score represents more similarity.
+        """
+        return self.similarity_search_with_score_by_vector(
+            self._embed_query(query),
+            k,
+            filter=filter,
+            search_params=search_params,
+            offset=offset,
+            score_threshold=score_threshold,
+            consistency=consistency,
+            **kwargs,
+        )
+
+    def similarity_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        filter: Optional[MetadataFilter] = None,
+        search_params: Optional[common_types.SearchParams] = None,
+        offset: int = 0,
+        score_threshold: Optional[float] = None,
+        consistency: Optional[common_types.ReadConsistency] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs most similar to the given embedding vector.
+
+        Args:
+            embedding: Embedding vector to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            filter: Filter by metadata. Defaults to None.
+            search_params: Additional search params. Defaults to None.
+            offset:
+                Offset of the first result to return.
+                May be used to paginate results.
+                Note: large offset values may cause performance issues.
+            score_threshold:
+                Define a minimal score threshold for the result.
+                If defined, less similar results will not be returned.
+                Score of the returned result might be higher or lower than the
+                threshold depending on the Distance function used.
+                E.g. for cosine similarity only higher scores will be returned.
+            consistency:
+                Read consistency of the search. Defines how many replicas should be
+                queried before returning the result.
+                Values:
+                - int - number of replicas to query, values should be present in
+                        all queried replicas
+                - 'majority' - query all replicas, but return values present in the
+                               majority of replicas
+                - 'quorum' - query the majority of replicas, return values present in
+                             all of them
+                - 'all' - query all replicas, and return values present in all replicas
+
+        Returns:
+            List of Documents most similar to the embedding, without scores.
+        """
+        # Delegate to the scored search, then keep only the Document of each pair.
+        results = self.similarity_search_with_score_by_vector(
+            embedding,
+            k,
+            filter=filter,
+            search_params=search_params,
+            offset=offset,
+            score_threshold=score_threshold,
+            consistency=consistency,
+            **kwargs,
+        )
+        return list(map(itemgetter(0), results))
+
+    def similarity_search_with_score_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        filter: Optional[MetadataFilter] = None,
+        search_params: Optional[common_types.SearchParams] = None,
+        offset: int = 0,
+        score_threshold: Optional[float] = None,
+        consistency: Optional[common_types.ReadConsistency] = None,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs most similar to embedding vector.
+
+        Args:
+            embedding: Embedding vector to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            filter: Filter by metadata. Defaults to None.
+            search_params: Additional search params
+            offset:
+                Offset of the first result to return.
+                May be used to paginate results.
+                Note: large offset values may cause performance issues.
+            score_threshold:
+                Define a minimal score threshold for the result.
+                If defined, less similar results will not be returned.
+                Score of the returned result might be higher or smaller than the
+                threshold depending on the Distance function used.
+                E.g. for cosine similarity only higher scores will be returned.
+            consistency:
+                Read consistency of the search. Defines how many replicas should be
+                queried before returning the result.
+                Values:
+                - int - number of replicas to query, values should present in all
+                        queried replicas
+                - 'majority' - query all replicas, but return values present in the
+                               majority of replicas
+                - 'quorum' - query the majority of replicas, return values present in
+                             all of them
+                - 'all' - query all replicas, and return values present in all replicas
+
        Returns:
            List of documents most similar to the query text and cosine
            distance in float for each.
@ -274,7 +386,7 @@ class Qdrant(VectorStore):
            qdrant_filter = filter
        results = self.client.search(
            collection_name=self.collection_name,
-            query_vector=self._embed_query(query),
+            query_vector=embedding,
            query_filter=qdrant_filter,
            search_params=search_params,
            limit=k,
--- a/tests/integration_tests/vectorstores/test_qdrant.py
+++ b/tests/integration_tests/vectorstores/test_qdrant.py
@ -40,6 +40,65 @@ def test_qdrant_similarity_search(
    assert output == [Document(page_content="foo")]


+@pytest.mark.parametrize("batch_size", [1, 64])
+@pytest.mark.parametrize(
+    ["content_payload_key", "metadata_payload_key"],
+    [
+        (Qdrant.CONTENT_KEY, Qdrant.METADATA_KEY),
+        ("foo", "bar"),
+        (Qdrant.CONTENT_KEY, "bar"),
+        ("foo", Qdrant.METADATA_KEY),
+    ],
+)
+def test_qdrant_similarity_search_by_vector(
+    batch_size: int, content_payload_key: str, metadata_payload_key: str
+) -> None:
+    """Test end-to-end construction and search by a precomputed embedding vector."""
+    texts = ["foo", "bar", "baz"]
+    docsearch = Qdrant.from_texts(
+        texts,
+        ConsistentFakeEmbeddings(),
+        location=":memory:",
+        content_payload_key=content_payload_key,
+        metadata_payload_key=metadata_payload_key,
+        batch_size=batch_size,
+    )
+    embeddings = ConsistentFakeEmbeddings().embed_query("foo")
+    output = docsearch.similarity_search_by_vector(embeddings, k=1)
+    assert output == [Document(page_content="foo")]
+
+
+@pytest.mark.parametrize("batch_size", [1, 64])
+@pytest.mark.parametrize(
+    ["content_payload_key", "metadata_payload_key"],
+    [
+        (Qdrant.CONTENT_KEY, Qdrant.METADATA_KEY),
+        ("foo", "bar"),
+        (Qdrant.CONTENT_KEY, "bar"),
+        ("foo", Qdrant.METADATA_KEY),
+    ],
+)
+def test_qdrant_similarity_search_with_score_by_vector(
+    batch_size: int, content_payload_key: str, metadata_payload_key: str
+) -> None:
+    """Test end-to-end construction and scored search by a precomputed vector."""
+    texts = ["foo", "bar", "baz"]
+    docsearch = Qdrant.from_texts(
+        texts,
+        ConsistentFakeEmbeddings(),
+        location=":memory:",
+        content_payload_key=content_payload_key,
+        metadata_payload_key=metadata_payload_key,
+        batch_size=batch_size,
+    )
+    embeddings = ConsistentFakeEmbeddings().embed_query("foo")
+    output = docsearch.similarity_search_with_score_by_vector(embeddings, k=1)
+    assert len(output) == 1
+    document, score = output[0]
+    assert document == Document(page_content="foo")
+    assert score >= 0
+
+
@pytest.mark.parametrize("batch_size", [1, 64])
 def test_qdrant_add_documents(batch_size: int) -> None:
    """Test end to end construction and search."""