diff --git a/libs/langchain/langchain/vectorstores/matching_engine.py b/libs/langchain/langchain/vectorstores/matching_engine.py index e60987f0b1..a852a5bc5e 100644 --- a/libs/langchain/langchain/vectorstores/matching_engine.py +++ b/libs/langchain/langchain/vectorstores/matching_engine.py @@ -4,7 +4,7 @@ import json import logging import time import uuid -from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Type +from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type from langchain.schema.document import Document from langchain.schema.embeddings import Embeddings @@ -14,6 +14,9 @@ from langchain.utilities.vertexai import get_client_info if TYPE_CHECKING: from google.cloud import storage from google.cloud.aiplatform import MatchingEngineIndex, MatchingEngineIndexEndpoint + from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import ( + Namespace, + ) from google.oauth2.service_account import Credentials from langchain.embeddings import TensorflowHubEmbeddings @@ -169,41 +172,85 @@ class MatchingEngine(VectorStore): blob = bucket.blob(gcs_location) blob.upload_from_string(data) - def similarity_search( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query. + def similarity_search_with_score( + self, + query: str, + k: int = 4, + filter: Optional[List[Namespace]] = None, + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query and their cosine distance from the query. Args: - query: The string that will be used to search for similar documents. - k: The amount of neighbors that will be retrieved. + query: String query look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: Optional. A list of Namespaces for filtering + the matching results. + For example: + [Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])] + will match datapoints that satisfy "red color" but not include + datapoints with "squared shape". Please refer to + https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json + for more detail. Returns: - A list of k matching documents. + List[Tuple[Document, float]]: List of documents most similar to + the query text and cosine distance in float for each. + Lower score represents more similarity. """ - logger.debug(f"Embedding query {query}.") - embedding_query = self.embedding.embed_documents([query]) + embedding_query = self.embedding.embed_query(query) + return self.similarity_search_by_vector_with_score( + embedding_query, k=k, filter=filter + ) + + def similarity_search_by_vector_with_score( + self, + embedding: List[float], + k: int = 4, + filter: Optional[List[Namespace]] = None, + ) -> List[Tuple[Document, float]]: + """Return docs most similar to the embedding and their cosine distance. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: Optional. A list of Namespaces for filtering + the matching results. + For example: + [Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])] + will match datapoints that satisfy "red color" but not include + datapoints with "squared shape". Please refer to + https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json + for more detail. + + Returns: + List[Tuple[Document, float]]: List of documents most similar to + the query text and cosine distance in float for each. + Lower score represents more similarity. + """ + filter = filter or [] # If the endpoint is public we use the find_neighbors function. if self.endpoint._public_match_client: response = self.endpoint.find_neighbors( deployed_index_id=self._get_index_id(), - queries=embedding_query, + queries=[embedding], num_neighbors=k, + filter=filter, ) else: response = self.endpoint.match( deployed_index_id=self._get_index_id(), - queries=embedding_query, + queries=[embedding], num_neighbors=k, + filter=filter, ) + logger.debug(f"Found {len(response)} matches.") + if len(response) == 0: return [] - logger.debug(f"Found {len(response)} matches for the query {query}.") - results = [] # I'm only getting the first one because queries receives an array @@ -212,12 +259,70 @@ class MatchingEngine(VectorStore): # one element. for doc in response[0]: page_content = self._download_from_gcs(f"documents/{doc.id}") - results.append(Document(page_content=page_content)) + results.append((Document(page_content=page_content), doc.distance)) logger.debug("Downloaded documents for query.") return results + def similarity_search( + self, + query: str, + k: int = 4, + filter: Optional[List[Namespace]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to query. + + Args: + query: The string that will be used to search for similar documents. + k: The amount of neighbors that will be retrieved. + filter: Optional. A list of Namespaces for filtering the matching results. + For example: + [Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])] + will match datapoints that satisfy "red color" but not include + datapoints with "squared shape". Please refer to + https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json + for more detail. + + Returns: + A list of k matching documents. + """ + docs_and_scores = self.similarity_search_with_score( + query, k=k, filter=filter, **kwargs + ) + + return [doc for doc, _ in docs_and_scores] + + def similarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[List[Namespace]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to the embedding. + + Args: + embedding: Embedding to look up documents similar to. + k: The amount of neighbors that will be retrieved. + filter: Optional. A list of Namespaces for filtering the matching results. + For example: + [Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])] + will match datapoints that satisfy "red color" but not include + datapoints with "squared shape". Please refer to + https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json + for more detail. + + Returns: + A list of k matching documents. + """ + docs_and_scores = self.similarity_search_by_vector_with_score( + embedding, k=k, filter=filter, **kwargs + ) + + return [doc for doc, _ in docs_and_scores] + def _get_index_id(self) -> str: """Gets the correct index id for the endpoint.