|
|
|
@ -4,7 +4,7 @@ import json
|
|
|
|
|
import logging
|
|
|
|
|
import time
|
|
|
|
|
import uuid
|
|
|
|
|
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Type
|
|
|
|
|
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type
|
|
|
|
|
|
|
|
|
|
from langchain.schema.document import Document
|
|
|
|
|
from langchain.schema.embeddings import Embeddings
|
|
|
|
@ -14,6 +14,9 @@ from langchain.utilities.vertexai import get_client_info
|
|
|
|
|
if TYPE_CHECKING:
|
|
|
|
|
from google.cloud import storage
|
|
|
|
|
from google.cloud.aiplatform import MatchingEngineIndex, MatchingEngineIndexEndpoint
|
|
|
|
|
from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import (
|
|
|
|
|
Namespace,
|
|
|
|
|
)
|
|
|
|
|
from google.oauth2.service_account import Credentials
|
|
|
|
|
|
|
|
|
|
from langchain.embeddings import TensorflowHubEmbeddings
|
|
|
|
@ -169,41 +172,85 @@ class MatchingEngine(VectorStore):
|
|
|
|
|
blob = bucket.blob(gcs_location)
|
|
|
|
|
blob.upload_from_string(data)
|
|
|
|
|
|
|
|
|
|
def similarity_search(
|
|
|
|
|
self, query: str, k: int = 4, **kwargs: Any
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Return docs most similar to query.
|
|
|
|
|
def similarity_search_with_score(
|
|
|
|
|
self,
|
|
|
|
|
query: str,
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[List[Namespace]] = None,
|
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
|
"""Return docs most similar to query and their cosine distance from the query.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query: The string that will be used to search for similar documents.
|
|
|
|
|
k: The amount of neighbors that will be retrieved.
|
|
|
|
|
query: String query look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Optional. A list of Namespaces for filtering
|
|
|
|
|
the matching results.
|
|
|
|
|
For example:
|
|
|
|
|
[Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
|
|
|
|
|
will match datapoints that satisfy "red color" but not include
|
|
|
|
|
datapoints with "squared shape". Please refer to
|
|
|
|
|
https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
|
|
|
|
|
for more detail.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A list of k matching documents.
|
|
|
|
|
List[Tuple[Document, float]]: List of documents most similar to
|
|
|
|
|
the query text and cosine distance in float for each.
|
|
|
|
|
Lower score represents more similarity.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
logger.debug(f"Embedding query {query}.")
|
|
|
|
|
embedding_query = self.embedding.embed_documents([query])
|
|
|
|
|
embedding_query = self.embedding.embed_query(query)
|
|
|
|
|
return self.similarity_search_by_vector_with_score(
|
|
|
|
|
embedding_query, k=k, filter=filter
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def similarity_search_by_vector_with_score(
|
|
|
|
|
self,
|
|
|
|
|
embedding: List[float],
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[List[Namespace]] = None,
|
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
|
"""Return docs most similar to the embedding and their cosine distance.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
embedding: Embedding to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Optional. A list of Namespaces for filtering
|
|
|
|
|
the matching results.
|
|
|
|
|
For example:
|
|
|
|
|
[Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
|
|
|
|
|
will match datapoints that satisfy "red color" but not include
|
|
|
|
|
datapoints with "squared shape". Please refer to
|
|
|
|
|
https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
|
|
|
|
|
for more detail.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List[Tuple[Document, float]]: List of documents most similar to
|
|
|
|
|
the query text and cosine distance in float for each.
|
|
|
|
|
Lower score represents more similarity.
|
|
|
|
|
"""
|
|
|
|
|
filter = filter or []
|
|
|
|
|
|
|
|
|
|
# If the endpoint is public we use the find_neighbors function.
|
|
|
|
|
if self.endpoint._public_match_client:
|
|
|
|
|
response = self.endpoint.find_neighbors(
|
|
|
|
|
deployed_index_id=self._get_index_id(),
|
|
|
|
|
queries=embedding_query,
|
|
|
|
|
queries=[embedding],
|
|
|
|
|
num_neighbors=k,
|
|
|
|
|
filter=filter,
|
|
|
|
|
)
|
|
|
|
|
else:
|
|
|
|
|
response = self.endpoint.match(
|
|
|
|
|
deployed_index_id=self._get_index_id(),
|
|
|
|
|
queries=embedding_query,
|
|
|
|
|
queries=[embedding],
|
|
|
|
|
num_neighbors=k,
|
|
|
|
|
filter=filter,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
logger.debug(f"Found {len(response)} matches.")
|
|
|
|
|
|
|
|
|
|
if len(response) == 0:
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
logger.debug(f"Found {len(response)} matches for the query {query}.")
|
|
|
|
|
|
|
|
|
|
results = []
|
|
|
|
|
|
|
|
|
|
# I'm only getting the first one because queries receives an array
|
|
|
|
@ -212,12 +259,70 @@ class MatchingEngine(VectorStore):
|
|
|
|
|
# one element.
|
|
|
|
|
for doc in response[0]:
|
|
|
|
|
page_content = self._download_from_gcs(f"documents/{doc.id}")
|
|
|
|
|
results.append(Document(page_content=page_content))
|
|
|
|
|
results.append((Document(page_content=page_content), doc.distance))
|
|
|
|
|
|
|
|
|
|
logger.debug("Downloaded documents for query.")
|
|
|
|
|
|
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
|
def similarity_search(
|
|
|
|
|
self,
|
|
|
|
|
query: str,
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[List[Namespace]] = None,
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Return docs most similar to query.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query: The string that will be used to search for similar documents.
|
|
|
|
|
k: The amount of neighbors that will be retrieved.
|
|
|
|
|
filter: Optional. A list of Namespaces for filtering the matching results.
|
|
|
|
|
For example:
|
|
|
|
|
[Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
|
|
|
|
|
will match datapoints that satisfy "red color" but not include
|
|
|
|
|
datapoints with "squared shape". Please refer to
|
|
|
|
|
https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
|
|
|
|
|
for more detail.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A list of k matching documents.
|
|
|
|
|
"""
|
|
|
|
|
docs_and_scores = self.similarity_search_with_score(
|
|
|
|
|
query, k=k, filter=filter, **kwargs
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return [doc for doc, _ in docs_and_scores]
|
|
|
|
|
|
|
|
|
|
def similarity_search_by_vector(
|
|
|
|
|
self,
|
|
|
|
|
embedding: List[float],
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[List[Namespace]] = None,
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Return docs most similar to the embedding.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
embedding: Embedding to look up documents similar to.
|
|
|
|
|
k: The amount of neighbors that will be retrieved.
|
|
|
|
|
filter: Optional. A list of Namespaces for filtering the matching results.
|
|
|
|
|
For example:
|
|
|
|
|
[Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
|
|
|
|
|
will match datapoints that satisfy "red color" but not include
|
|
|
|
|
datapoints with "squared shape". Please refer to
|
|
|
|
|
https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
|
|
|
|
|
for more detail.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A list of k matching documents.
|
|
|
|
|
"""
|
|
|
|
|
docs_and_scores = self.similarity_search_by_vector_with_score(
|
|
|
|
|
embedding, k=k, filter=filter, **kwargs
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
return [doc for doc, _ in docs_and_scores]
|
|
|
|
|
|
|
|
|
|
def _get_index_id(self) -> str:
|
|
|
|
|
"""Gets the correct index id for the endpoint.
|
|
|
|
|
|
|
|
|
|