mirror of
https://github.com/hwchase17/langchain
synced 2024-11-18 09:25:54 +00:00
community[minor]: add additional support for BigQueryVectorSearch
(#15904)
BigQuery vector search lets you use GoogleSQL to do semantic search, using vector indexes for fast but approximate results, or using brute force for exact results. This PR: 1. Add `metadata["__job_id"]` in Document returned by any similarity search 2. Add `explore_job_stats` to enable users to explore job statistics and improve debuggability 3. Set the minimum row limit for running create vector index.
This commit is contained in:
parent
8799b028a6
commit
ce7723c1e5
@ -324,6 +324,24 @@
|
||||
"docs = store.similarity_search_by_vector(query_vector, filter={\"len\": 6})\n",
|
||||
"print(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Explore job statistics with BigQuery Job Id"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"job_id = \"\" # @param {type:\"string\"}\n",
|
||||
"# Debug and explore the job statistics with a BigQuery Job id.\n",
|
||||
"store.explore_job_stats(job_id)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -28,6 +28,7 @@ DEFAULT_METADATA_COLUMN_NAME = "metadata" # document metadata
|
||||
DEFAULT_CONTENT_COLUMN_NAME = "content" # text content, do not rename
|
||||
DEFAULT_TOP_K = 4 # default number of documents returned from similarity search
|
||||
|
||||
_MIN_INDEX_ROWS = 5000 # minimal number of rows for creating an index
|
||||
_INDEX_CHECK_PERIOD_SECONDS = 60  # Do not check for index more often than this.
|
||||
|
||||
_vector_table_lock = Lock() # process-wide BigQueryVectorSearch table lock
|
||||
@ -192,6 +193,11 @@ class BigQueryVectorSearch(VectorStore):
|
||||
if self._have_index or self._creating_index:
|
||||
# Already have an index or in the process of creating one.
|
||||
return
|
||||
table = self.bq_client.get_table(self.vectors_table)
|
||||
if (table.num_rows or 0) < _MIN_INDEX_ROWS:
|
||||
# Not enough rows to create index.
|
||||
self._logger.debug("Not enough rows to create a vector index.")
|
||||
return
|
||||
if (
|
||||
datetime.utcnow() - self._last_index_check
|
||||
).total_seconds() < _INDEX_CHECK_PERIOD_SECONDS:
|
||||
@ -228,6 +234,10 @@ class BigQueryVectorSearch(VectorStore):
|
||||
def _create_index(self):
|
||||
from google.api_core.exceptions import ClientError
|
||||
|
||||
table = self.bq_client.get_table(self.vectors_table)
|
||||
if (table.num_rows or 0) < _MIN_INDEX_ROWS:
|
||||
# Not enough rows to create index.
|
||||
return
|
||||
if self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
|
||||
distance_type = "EUCLIDEAN"
|
||||
elif self.distance_strategy == DistanceStrategy.COSINE:
|
||||
@ -534,6 +544,7 @@ class BigQueryVectorSearch(VectorStore):
|
||||
else:
|
||||
metadata = {}
|
||||
metadata["__id"] = row[self.doc_id_field]
|
||||
metadata["__job_id"] = job.job_id
|
||||
doc = Document(page_content=row[self.content_field], metadata=metadata)
|
||||
document_tuples.append(
|
||||
(doc, row[self.text_embedding_field], row["_vector_search_distance"])
|
||||
@ -833,3 +844,14 @@ class BigQueryVectorSearch(VectorStore):
|
||||
vs_obj = BigQueryVectorSearch(embedding=embedding, **kwargs)
|
||||
vs_obj.add_texts(texts, metadatas)
|
||||
return vs_obj
|
||||
|
||||
def explore_job_stats(self, job_id: str) -> Dict:
    """Look up the statistics recorded for a single BigQuery job execution.

    Args:
        job_id: The BigQuery Job id.

    Returns:
        A dictionary of job statistics for a given job.
    """
    job = self.bq_client.get_job(job_id)
    # NOTE(review): this reads the private ``_properties`` payload of the
    # returned job object instead of a public accessor — confirm against the
    # BigQuery client library that this key is stable before relying on it.
    return job._properties["statistics"]
|
||||
|
Loading…
Reference in New Issue
Block a user