From ce7723c1e5ea672d9f585ffbde82edabed880cb4 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Mon, 15 Jan 2024 11:45:15 -0700 Subject: [PATCH] community[minor]: add additional support for `BigQueryVectorSearch` (#15904) BigQuery vector search lets you use GoogleSQL to do semantic search, using vector indexes for fast but approximate results, or using brute force for exact results. This PR: 1. Add `metadata["__job_id"]` to the Document returned by any similarity search 2. Add `explore_job_stats` to enable users to explore job statistics and improve debuggability 3. Set a minimum row limit for creating a vector index. --- .../vectorstores/bigquery_vector_search.ipynb | 18 +++++++++++++++ .../vectorstores/bigquery_vector_search.py | 22 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/docs/docs/integrations/vectorstores/bigquery_vector_search.ipynb b/docs/docs/integrations/vectorstores/bigquery_vector_search.ipynb index 81f31bdae5..29b9430871 100644 --- a/docs/docs/integrations/vectorstores/bigquery_vector_search.ipynb +++ b/docs/docs/integrations/vectorstores/bigquery_vector_search.ipynb @@ -324,6 +324,24 @@ "docs = store.similarity_search_by_vector(query_vector, filter={\"len\": 6})\n", "print(docs)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Explore job statistics with BigQuery Job Id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "job_id = \"\" # @param {type:\"string\"}\n", + "# Debug and explore the job statistics with a BigQuery Job id.\n", + "store.explore_job_stats(job_id)" + ] + } ], "metadata": { diff --git a/libs/community/langchain_community/vectorstores/bigquery_vector_search.py b/libs/community/langchain_community/vectorstores/bigquery_vector_search.py index d132e7e071..64a1f4b765 100644 --- a/libs/community/langchain_community/vectorstores/bigquery_vector_search.py +++ 
b/libs/community/langchain_community/vectorstores/bigquery_vector_search.py @@ -28,6 +28,7 @@ DEFAULT_METADATA_COLUMN_NAME = "metadata" # document metadata DEFAULT_CONTENT_COLUMN_NAME = "content" # text content, do not rename DEFAULT_TOP_K = 4 # default number of documents returned from similarity search +_MIN_INDEX_ROWS = 5000 # minimal number of rows for creating an index _INDEX_CHECK_PERIOD_SECONDS = 60 # Do not check for index more often that this. _vector_table_lock = Lock() # process-wide BigQueryVectorSearch table lock @@ -192,6 +193,11 @@ class BigQueryVectorSearch(VectorStore): if self._have_index or self._creating_index: # Already have an index or in the process of creating one. return + table = self.bq_client.get_table(self.vectors_table) + if (table.num_rows or 0) < _MIN_INDEX_ROWS: + # Not enough rows to create index. + self._logger.debug("Not enough rows to create a vector index.") + return if ( datetime.utcnow() - self._last_index_check ).total_seconds() < _INDEX_CHECK_PERIOD_SECONDS: @@ -228,6 +234,10 @@ class BigQueryVectorSearch(VectorStore): def _create_index(self): from google.api_core.exceptions import ClientError + table = self.bq_client.get_table(self.vectors_table) + if (table.num_rows or 0) < _MIN_INDEX_ROWS: + # Not enough rows to create index. 
+ return if self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE: distance_type = "EUCLIDEAN" elif self.distance_strategy == DistanceStrategy.COSINE: @@ -534,6 +544,7 @@ class BigQueryVectorSearch(VectorStore): else: metadata = {} metadata["__id"] = row[self.doc_id_field] + metadata["__job_id"] = job.job_id doc = Document(page_content=row[self.content_field], metadata=metadata) document_tuples.append( (doc, row[self.text_embedding_field], row["_vector_search_distance"]) @@ -833,3 +844,14 @@ class BigQueryVectorSearch(VectorStore): vs_obj = BigQueryVectorSearch(embedding=embedding, **kwargs) vs_obj.add_texts(texts, metadatas) return vs_obj + + def explore_job_stats(self, job_id: str) -> Dict: + """Return the statistics for a single job execution. + + Args: + job_id: The BigQuery Job id. + + Returns: + A dictionary of job statistics for a given job. + """ + return self.bq_client.get_job(job_id)._properties["statistics"]