community[minor]: add additional support for `BigQueryVectorSearch` (#15904)

BigQuery vector search lets you use GoogleSQL to do semantic search,
using vector indexes for fast but approximate results, or using brute
force for exact results.
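
For context (not part of this change), a minimal sketch of how the store is typically constructed and queried might look like the following; the project, dataset, table, and embedding-model names are placeholders, and the embedding class/parameters are an assumption about the surrounding setup:

```python
from langchain_community.embeddings import VertexAIEmbeddings
from langchain_community.vectorstores import BigQueryVectorSearch

# Placeholder project/dataset/table names -- substitute your own.
embedding = VertexAIEmbeddings(model_name="textembedding-gecko@latest")
store = BigQueryVectorSearch(
    project_id="my-project",
    dataset_name="my_dataset",
    table_name="doc_and_vectors",
    location="us-central1",
    embedding=embedding,
)

docs = store.similarity_search("How do I create a vector index?")
for doc in docs:
    print(doc.page_content, doc.metadata)
```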

This PR:
1. Add `metadata["__job_id"]` to the `Document`s returned by any similarity search.
2. Add `explore_job_stats` so users can inspect job statistics and improve debuggability (a usage sketch follows this list).
3. Set a minimum row count below which a vector index is not created.
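
A hedged sketch of how the additions in this PR could be used together, assuming `store` is a configured `BigQueryVectorSearch` instance as above:

```python
docs = store.similarity_search("intrusion detection logs")

# 1. Each returned Document now also carries the id of the BigQuery job
#    that produced it, next to the existing "__id" metadata key.
job_id = docs[0].metadata["__job_id"]

# 2. The new helper returns the raw statistics of that job for debugging.
stats = store.explore_job_stats(job_id)
print(stats)
```
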
Ashley Xu 9 months ago committed by GitHub
parent 8799b028a6
commit ce7723c1e5

@@ -324,6 +324,24 @@
"docs = store.similarity_search_by_vector(query_vector, filter={\"len\": 6})\n",
"print(docs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Explore job satistics with BigQuery Job Id"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"job_id = \"\" # @param {type:\"string\"}\n",
"# Debug and explore the job statistics with a BigQuery Job id.\n",
"store.explore_job_stats(job_id)"
]
}
],
"metadata": {

@@ -28,6 +28,7 @@ DEFAULT_METADATA_COLUMN_NAME = "metadata" # document metadata
DEFAULT_CONTENT_COLUMN_NAME = "content" # text content, do not rename
DEFAULT_TOP_K = 4 # default number of documents returned from similarity search
_MIN_INDEX_ROWS = 5000 # minimal number of rows for creating an index
_INDEX_CHECK_PERIOD_SECONDS = 60 # Do not check for index more often than this.
_vector_table_lock = Lock() # process-wide BigQueryVectorSearch table lock
@@ -192,6 +193,11 @@ class BigQueryVectorSearch(VectorStore):
        if self._have_index or self._creating_index:
            # Already have an index or in the process of creating one.
            return
        table = self.bq_client.get_table(self.vectors_table)
        if (table.num_rows or 0) < _MIN_INDEX_ROWS:
            # Not enough rows to create index.
            self._logger.debug("Not enough rows to create a vector index.")
            return
        if (
            datetime.utcnow() - self._last_index_check
        ).total_seconds() < _INDEX_CHECK_PERIOD_SECONDS:
@@ -228,6 +234,10 @@ class BigQueryVectorSearch(VectorStore):
    def _create_index(self):
        from google.api_core.exceptions import ClientError

        table = self.bq_client.get_table(self.vectors_table)
        if (table.num_rows or 0) < _MIN_INDEX_ROWS:
            # Not enough rows to create index.
            return
        if self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
            distance_type = "EUCLIDEAN"
        elif self.distance_strategy == DistanceStrategy.COSINE:
@@ -534,6 +544,7 @@ class BigQueryVectorSearch(VectorStore):
            else:
                metadata = {}
            metadata["__id"] = row[self.doc_id_field]
            metadata["__job_id"] = job.job_id
            doc = Document(page_content=row[self.content_field], metadata=metadata)
            document_tuples.append(
                (doc, row[self.text_embedding_field], row["_vector_search_distance"])
@@ -833,3 +844,14 @@ class BigQueryVectorSearch(VectorStore):
        vs_obj = BigQueryVectorSearch(embedding=embedding, **kwargs)
        vs_obj.add_texts(texts, metadatas)
        return vs_obj

    def explore_job_stats(self, job_id: str) -> Dict:
        """Return the statistics for a single job execution.

        Args:
            job_id: The BigQuery Job id.

        Returns:
            A dictionary of job statistics for a given job.
        """
        return self.bq_client.get_job(job_id)._properties["statistics"]
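
The returned value is the raw `statistics` block of the BigQuery Job resource. As an illustrative sketch (field names follow the REST representation and any of them may be absent for a particular job):

```python
stats = store.explore_job_stats(job_id)

# Use .get() because individual statistics fields may be missing
# depending on the job type.
print("created:", stats.get("creationTime"))
print("started:", stats.get("startTime"))
print("ended:", stats.get("endTime"))
print("bytes processed:", stats.get("totalBytesProcessed"))
```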
