|
|
|
@ -22,7 +22,7 @@ if TYPE_CHECKING:
|
|
|
|
|
|
|
|
|
|
from langchain_community.embeddings import TensorflowHubEmbeddings
|
|
|
|
|
|
|
|
|
|
logger = logging.getLogger()
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MatchingEngine(VectorStore):
|
|
|
|
@ -49,6 +49,8 @@ class MatchingEngine(VectorStore):
|
|
|
|
|
gcs_client: storage.Client,
|
|
|
|
|
gcs_bucket_name: str,
|
|
|
|
|
credentials: Optional[Credentials] = None,
|
|
|
|
|
*,
|
|
|
|
|
document_id_key: Optional[str] = None,
|
|
|
|
|
):
|
|
|
|
|
"""Google Vertex AI Vector Search (previously Matching Engine)
|
|
|
|
|
implementation of the vector store.
|
|
|
|
@ -78,6 +80,9 @@ class MatchingEngine(VectorStore):
|
|
|
|
|
gcs_client: The GCS client.
|
|
|
|
|
gcs_bucket_name: The GCS bucket name.
|
|
|
|
|
credentials (Optional): Created GCP credentials.
|
|
|
|
|
document_id_key (Optional): Key for storing document ID in document
|
|
|
|
|
metadata. If None, document ID will not be returned in document
|
|
|
|
|
metadata.
|
|
|
|
|
"""
|
|
|
|
|
super().__init__()
|
|
|
|
|
self._validate_google_libraries_installation()
|
|
|
|
@ -89,6 +94,7 @@ class MatchingEngine(VectorStore):
|
|
|
|
|
self.gcs_client = gcs_client
|
|
|
|
|
self.credentials = credentials
|
|
|
|
|
self.gcs_bucket_name = gcs_bucket_name
|
|
|
|
|
self.document_id_key = document_id_key
|
|
|
|
|
|
|
|
|
|
@property
|
|
|
|
|
def embeddings(self) -> Embeddings:
|
|
|
|
@ -229,6 +235,7 @@ class MatchingEngine(VectorStore):
|
|
|
|
|
List[Tuple[Document, float]]: List of documents most similar to
|
|
|
|
|
the query text and cosine distance in float for each.
|
|
|
|
|
Lower score represents more similarity.
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
filter = filter or []
|
|
|
|
|
|
|
|
|
@ -255,19 +262,27 @@ class MatchingEngine(VectorStore):
|
|
|
|
|
if len(response) == 0:
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
results = []
|
|
|
|
|
docs: List[Tuple[Document, float]] = []
|
|
|
|
|
|
|
|
|
|
# I'm only getting the first one because queries receives an array
|
|
|
|
|
# and the similarity_search method only receives one query. This
|
|
|
|
|
# means that the match method will always return an array with only
|
|
|
|
|
# one element.
|
|
|
|
|
for doc in response[0]:
|
|
|
|
|
page_content = self._download_from_gcs(f"documents/{doc.id}")
|
|
|
|
|
results.append((Document(page_content=page_content), doc.distance))
|
|
|
|
|
for result in response[0]:
|
|
|
|
|
page_content = self._download_from_gcs(f"documents/{result.id}")
|
|
|
|
|
# TODO: return all metadata.
|
|
|
|
|
metadata = {}
|
|
|
|
|
if self.document_id_key is not None:
|
|
|
|
|
metadata[self.document_id_key] = result.id
|
|
|
|
|
document = Document(
|
|
|
|
|
page_content=page_content,
|
|
|
|
|
metadata=metadata,
|
|
|
|
|
)
|
|
|
|
|
docs.append((document, result.distance))
|
|
|
|
|
|
|
|
|
|
logger.debug("Downloaded documents for query.")
|
|
|
|
|
|
|
|
|
|
return results
|
|
|
|
|
return docs
|
|
|
|
|
|
|
|
|
|
def similarity_search(
|
|
|
|
|
self,
|
|
|
|
@ -382,6 +397,7 @@ class MatchingEngine(VectorStore):
|
|
|
|
|
endpoint_id: str,
|
|
|
|
|
credentials_path: Optional[str] = None,
|
|
|
|
|
embedding: Optional[Embeddings] = None,
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> "MatchingEngine":
|
|
|
|
|
"""Takes the object creation out of the constructor.
|
|
|
|
|
|
|
|
|
@ -397,6 +413,7 @@ class MatchingEngine(VectorStore):
|
|
|
|
|
the local file system.
|
|
|
|
|
embedding: The :class:`Embeddings` that will be used for
|
|
|
|
|
embedding the texts.
|
|
|
|
|
kwargs: Additional keyword arguments to pass to MatchingEngine.__init__().
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
A configured MatchingEngine with the texts added to the index.
|
|
|
|
@ -419,6 +436,7 @@ class MatchingEngine(VectorStore):
|
|
|
|
|
gcs_client=gcs_client,
|
|
|
|
|
credentials=credentials,
|
|
|
|
|
gcs_bucket_name=gcs_bucket_name,
|
|
|
|
|
**kwargs,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|