[Matching Engine] Update the Matching Engine to include the distance and filters (#12555)

Hello 👋, this pull request adds more capabilities to GCP's [MatchingEngine](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.matching_engine.MatchingEngine.html) vectorstore. It adds the `similarity_search_by_vector_with_score` method, which also returns the distance of each result, and support for [filters](https://cloud.google.com/vertex-ai/docs/vector-search/filtering) that restrict the results by namespace at retrieval time. - **Description:** Add a [filter](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.MatchingEngineIndexEndpoint#google_cloud_aiplatform_MatchingEngineIndexEndpoint_find_neighbors) argument to `similarity_search` and add the `similarity_search_by_vector_with_score` method - **Dependencies:** None - **Tag maintainer:** Unknown Thank you! --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
9 months ago · bfb27324cb
parent 3c5c384f1a
commit bfb27324cb
1 changed files with 120 additions and 15 deletions
--- a/libs/langchain/langchain/vectorstores/matching_engine.py
+++ b/libs/langchain/langchain/vectorstores/matching_engine.py
@ -4,7 +4,7 @@ import json
 import logging
 import time
 import uuid
-from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Type
+from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type

 from langchain.schema.document import Document
 from langchain.schema.embeddings import Embeddings
@ -14,6 +14,9 @@ from langchain.utilities.vertexai import get_client_info
 if TYPE_CHECKING:
    from google.cloud import storage
    from google.cloud.aiplatform import MatchingEngineIndex, MatchingEngineIndexEndpoint
+    from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import (
+        Namespace,
+    )
    from google.oauth2.service_account import Credentials

    from langchain.embeddings import TensorflowHubEmbeddings
@ -169,41 +172,85 @@ class MatchingEngine(VectorStore):
        blob = bucket.blob(gcs_location)
        blob.upload_from_string(data)

-    def similarity_search(
-        self, query: str, k: int = 4, **kwargs: Any
-    ) -> List[Document]:
-        """Return docs most similar to query.
+    def similarity_search_with_score(
+        self,
+        query: str,
+        k: int = 4,
+        filter: Optional[List[Namespace]] = None,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs most similar to query and their cosine distance from the query.

        Args:
-            query: The string that will be used to search for similar documents.
-            k: The amount of neighbors that will be retrieved.
+            query: String query look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            filter: Optional. A list of Namespaces for filtering
+                the matching results.
+                For example:
+                [Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
+                will match datapoints that satisfy "red color" but not include
+                datapoints with "squared shape". Please refer to
+                https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
+                for more detail.

        Returns:
-            A list of k matching documents.
+            List[Tuple[Document, float]]: List of documents most similar to
+            the query text and cosine distance in float for each.
+            Lower score represents more similarity.
        """
-
        logger.debug(f"Embedding query {query}.")
-        embedding_query = self.embedding.embed_documents([query])
+        embedding_query = self.embedding.embed_query(query)
+        return self.similarity_search_by_vector_with_score(
+            embedding_query, k=k, filter=filter
+        )
+
+    def similarity_search_by_vector_with_score(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        filter: Optional[List[Namespace]] = None,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs most similar to the embedding and their cosine distance.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            filter: Optional. A list of Namespaces for filtering
+                the matching results.
+                For example:
+                [Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
+                will match datapoints that satisfy "red color" but not include
+                datapoints with "squared shape". Please refer to
+                https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
+                for more detail.
+
+        Returns:
+            List[Tuple[Document, float]]: List of documents most similar to
+            the query text and cosine distance in float for each.
+            Lower score represents more similarity.
+        """
+        filter = filter or []

        # If the endpoint is public we use the find_neighbors function.
        if self.endpoint._public_match_client:
            response = self.endpoint.find_neighbors(
                deployed_index_id=self._get_index_id(),
-                queries=embedding_query,
+                queries=[embedding],
                num_neighbors=k,
+                filter=filter,
            )
        else:
            response = self.endpoint.match(
                deployed_index_id=self._get_index_id(),
-                queries=embedding_query,
+                queries=[embedding],
                num_neighbors=k,
+                filter=filter,
            )

+        logger.debug(f"Found {len(response)} matches.")
+
        if len(response) == 0:
            return []

-        logger.debug(f"Found {len(response)} matches for the query {query}.")
-
        results = []

        # I'm only getting the first one because queries receives an array
@ -212,12 +259,70 @@ class MatchingEngine(VectorStore):
        # one element.
        for doc in response[0]:
            page_content = self._download_from_gcs(f"documents/{doc.id}")
-            results.append(Document(page_content=page_content))
+            results.append((Document(page_content=page_content), doc.distance))

        logger.debug("Downloaded documents for query.")

        return results

+    def similarity_search(
+        self,
+        query: str,
+        k: int = 4,
+        filter: Optional[List[Namespace]] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs most similar to query.
+
+        Args:
+            query: The string that will be used to search for similar documents.
+            k: The amount of neighbors that will be retrieved.
+            filter: Optional. A list of Namespaces for filtering the matching results.
+                For example:
+                [Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
+                will match datapoints that satisfy "red color" but not include
+                datapoints with "squared shape". Please refer to
+                https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
+                for more detail.
+            **kwargs: Accepted for compatibility with the VectorStore interface.
+                They are not forwarded, because `similarity_search_with_score`
+                only accepts `query`, `k` and `filter`.
+
+        Returns:
+            A list of up to k matching documents, without their scores.
+        """
+        # Do not forward **kwargs: similarity_search_with_score takes no extra
+        # keyword arguments, so passing them through would raise a TypeError.
+        docs_and_scores = self.similarity_search_with_score(
+            query, k=k, filter=filter
+        )
+
+        return [doc for doc, _ in docs_and_scores]
+
+    def similarity_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        filter: Optional[List[Namespace]] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs most similar to the embedding.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: The amount of neighbors that will be retrieved.
+            filter: Optional. A list of Namespaces for filtering the matching results.
+                For example:
+                [Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
+                will match datapoints that satisfy "red color" but not include
+                datapoints with "squared shape". Please refer to
+                https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
+                for more detail.
+            **kwargs: Accepted for compatibility with the VectorStore interface.
+                They are not forwarded, because
+                `similarity_search_by_vector_with_score` only accepts
+                `embedding`, `k` and `filter`.
+
+        Returns:
+            A list of up to k matching documents, without their scores.
+        """
+        # Do not forward **kwargs: similarity_search_by_vector_with_score takes
+        # no extra keyword arguments, so passing them through would raise a
+        # TypeError.
+        docs_and_scores = self.similarity_search_by_vector_with_score(
+            embedding, k=k, filter=filter
+        )
+
+        return [doc for doc, _ in docs_and_scores]
+
    def _get_index_id(self) -> str:
        """Gets the correct index id for the endpoint.