[Matching Engine] Update the Matching Engine to include the distance and filters (#12555)

Hello 👋,

This pull request extends the GCP
[MatchingEngine](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.matching_engine.MatchingEngine.html)
vector store. It adds the `similarity_search_with_score` and
`similarity_search_by_vector_with_score` methods, which return each
document together with its distance, and a `filter` argument that accepts namespace
[filters](https://cloud.google.com/vertex-ai/docs/vector-search/filtering)
to restrict the results that are retrieved.

- **Description:** Add a
[filter](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.MatchingEngineIndexEndpoint#google_cloud_aiplatform_MatchingEngineIndexEndpoint_find_neighbors)
argument to `similarity_search` and `similarity_search_by_vector`, and add the
`similarity_search_with_score` and `similarity_search_by_vector_with_score` methods.
- **Dependencies:** None
- **Tag maintainer:** Unknown

Thank you!

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/12537/head^2
Florian Valeye 9 months ago committed by GitHub
parent 3c5c384f1a
commit bfb27324cb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -4,7 +4,7 @@ import json
import logging
import time
import uuid
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Type
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type
from langchain.schema.document import Document
from langchain.schema.embeddings import Embeddings
@ -14,6 +14,9 @@ from langchain.utilities.vertexai import get_client_info
if TYPE_CHECKING:
from google.cloud import storage
from google.cloud.aiplatform import MatchingEngineIndex, MatchingEngineIndexEndpoint
from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import (
Namespace,
)
from google.oauth2.service_account import Credentials
from langchain.embeddings import TensorflowHubEmbeddings
@ -169,41 +172,85 @@ class MatchingEngine(VectorStore):
blob = bucket.blob(gcs_location)
blob.upload_from_string(data)
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to query.
def similarity_search_with_score(
self,
query: str,
k: int = 4,
filter: Optional[List[Namespace]] = None,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query and their cosine distance from the query.
Args:
query: The string that will be used to search for similar documents.
k: The amount of neighbors that will be retrieved.
query: String query look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Optional. A list of Namespaces for filtering
the matching results.
For example:
[Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
will match datapoints that satisfy "red color" but not include
datapoints with "squared shape". Please refer to
https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
for more detail.
Returns:
A list of k matching documents.
List[Tuple[Document, float]]: List of documents most similar to
the query text and cosine distance in float for each.
Lower score represents more similarity.
"""
logger.debug(f"Embedding query {query}.")
embedding_query = self.embedding.embed_documents([query])
embedding_query = self.embedding.embed_query(query)
return self.similarity_search_by_vector_with_score(
embedding_query, k=k, filter=filter
)
def similarity_search_by_vector_with_score(
self,
embedding: List[float],
k: int = 4,
filter: Optional[List[Namespace]] = None,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to the embedding and their cosine distance.
Args:
embedding: Embedding to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: Optional. A list of Namespaces for filtering
the matching results.
For example:
[Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
will match datapoints that satisfy "red color" but not include
datapoints with "squared shape". Please refer to
https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
for more detail.
Returns:
List[Tuple[Document, float]]: List of documents most similar to
the query text and cosine distance in float for each.
Lower score represents more similarity.
"""
filter = filter or []
# If the endpoint is public we use the find_neighbors function.
if self.endpoint._public_match_client:
response = self.endpoint.find_neighbors(
deployed_index_id=self._get_index_id(),
queries=embedding_query,
queries=[embedding],
num_neighbors=k,
filter=filter,
)
else:
response = self.endpoint.match(
deployed_index_id=self._get_index_id(),
queries=embedding_query,
queries=[embedding],
num_neighbors=k,
filter=filter,
)
logger.debug(f"Found {len(response)} matches.")
if len(response) == 0:
return []
logger.debug(f"Found {len(response)} matches for the query {query}.")
results = []
# I'm only getting the first one because queries receives an array
@ -212,12 +259,70 @@ class MatchingEngine(VectorStore):
# one element.
for doc in response[0]:
page_content = self._download_from_gcs(f"documents/{doc.id}")
results.append(Document(page_content=page_content))
results.append((Document(page_content=page_content), doc.distance))
logger.debug("Downloaded documents for query.")
return results
def similarity_search(
    self,
    query: str,
    k: int = 4,
    filter: Optional[List[Namespace]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs most similar to query.

    Args:
        query: The string that will be used to search for similar documents.
        k: The amount of neighbors that will be retrieved.
        filter: Optional. A list of Namespaces for filtering the matching
            results. For example:
            [Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
            will match datapoints that satisfy "red color" but not include
            datapoints with "squared shape". Please refer to
            https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
            for more detail.
        **kwargs: Accepted so callers may pass extra search options, but
            ignored: ``similarity_search_with_score`` only takes ``query``,
            ``k`` and ``filter``.

    Returns:
        The documents returned by ``similarity_search_with_score``, in the
        same order, with their scores dropped.
    """
    # Deliberately not forwarding **kwargs: similarity_search_with_score has
    # no **kwargs parameter, so any extra keyword would raise TypeError.
    docs_and_scores = self.similarity_search_with_score(
        query, k=k, filter=filter
    )
    return [doc for doc, _ in docs_and_scores]
def similarity_search_by_vector(
    self,
    embedding: List[float],
    k: int = 4,
    filter: Optional[List[Namespace]] = None,
    **kwargs: Any,
) -> List[Document]:
    """Return docs most similar to the embedding.

    Args:
        embedding: Embedding to look up documents similar to.
        k: The amount of neighbors that will be retrieved.
        filter: Optional. A list of Namespaces for filtering the matching
            results. For example:
            [Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
            will match datapoints that satisfy "red color" but not include
            datapoints with "squared shape". Please refer to
            https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
            for more detail.
        **kwargs: Accepted so callers may pass extra search options, but
            ignored: ``similarity_search_by_vector_with_score`` only takes
            ``embedding``, ``k`` and ``filter``.

    Returns:
        The documents returned by ``similarity_search_by_vector_with_score``,
        in the same order, with their scores dropped.
    """
    # Deliberately not forwarding **kwargs: the scored variant has no
    # **kwargs parameter, so any extra keyword would raise TypeError.
    docs_and_scores = self.similarity_search_by_vector_with_score(
        embedding, k=k, filter=filter
    )
    return [doc for doc, _ in docs_and_scores]
def _get_index_id(self) -> str:
"""Gets the correct index id for the endpoint.

Loading…
Cancel
Save