community[minor]: add additional support for BigQueryVectorSearch (#15904)

BigQuery vector search lets you use GoogleSQL to do semantic search,
using vector indexes for fast but approximate results, or using brute
force for exact results.

This PR:
1. Adds `metadata["__job_id"]` to every Document returned by a similarity search
2. Adds `explore_job_stats` so users can explore job statistics, which
improves debuggability
3. Sets a minimum row count (5000) required before a vector index is created.
This commit is contained in:
Ashley Xu 2024-01-15 11:45:15 -07:00 committed by GitHub
parent 8799b028a6
commit ce7723c1e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 40 additions and 0 deletions

View File

@ -324,6 +324,24 @@
"docs = store.similarity_search_by_vector(query_vector, filter={\"len\": 6})\n",
"print(docs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Explore job statistics with BigQuery Job Id"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"job_id = \"\" # @param {type:\"string\"}\n",
"# Debug and explore the job statistics with a BigQuery Job id.\n",
"store.explore_job_stats(job_id)"
]
}
],
"metadata": {

View File

@ -28,6 +28,7 @@ DEFAULT_METADATA_COLUMN_NAME = "metadata" # document metadata
DEFAULT_CONTENT_COLUMN_NAME = "content" # text content, do not rename
DEFAULT_TOP_K = 4 # default number of documents returned from similarity search
_MIN_INDEX_ROWS = 5000 # minimal number of rows for creating an index
_INDEX_CHECK_PERIOD_SECONDS = 60 # Do not check for index more often than this.
_vector_table_lock = Lock() # process-wide BigQueryVectorSearch table lock
@ -192,6 +193,11 @@ class BigQueryVectorSearch(VectorStore):
if self._have_index or self._creating_index:
# Already have an index or in the process of creating one.
return
table = self.bq_client.get_table(self.vectors_table)
if (table.num_rows or 0) < _MIN_INDEX_ROWS:
# Not enough rows to create index.
self._logger.debug("Not enough rows to create a vector index.")
return
if (
datetime.utcnow() - self._last_index_check
).total_seconds() < _INDEX_CHECK_PERIOD_SECONDS:
@ -228,6 +234,10 @@ class BigQueryVectorSearch(VectorStore):
def _create_index(self):
from google.api_core.exceptions import ClientError
table = self.bq_client.get_table(self.vectors_table)
if (table.num_rows or 0) < _MIN_INDEX_ROWS:
# Not enough rows to create index.
return
if self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
distance_type = "EUCLIDEAN"
elif self.distance_strategy == DistanceStrategy.COSINE:
@ -534,6 +544,7 @@ class BigQueryVectorSearch(VectorStore):
else:
metadata = {}
metadata["__id"] = row[self.doc_id_field]
metadata["__job_id"] = job.job_id
doc = Document(page_content=row[self.content_field], metadata=metadata)
document_tuples.append(
(doc, row[self.text_embedding_field], row["_vector_search_distance"])
@ -833,3 +844,14 @@ class BigQueryVectorSearch(VectorStore):
vs_obj = BigQueryVectorSearch(embedding=embedding, **kwargs)
vs_obj.add_texts(texts, metadatas)
return vs_obj
def explore_job_stats(self, job_id: str) -> Dict:
    """Return the statistics for a single job execution.

    Args:
        job_id: The BigQuery Job id.

    Returns:
        A dictionary of job statistics for the given job. An empty
        dictionary is returned when the fetched job resource has no
        "statistics" section (or the section is null).
    """
    job = self.bq_client.get_job(job_id)
    # The statistics are read from the raw job resource. Tolerate a missing
    # or null section so callers always receive a dictionary rather than a
    # KeyError or None.
    return job._properties.get("statistics") or {}