|
|
|
@ -67,82 +67,6 @@ def _unique_list(lst: List[T], key: Callable[[T], U]) -> List[T]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AstraDBVectorStore(VectorStore):
|
|
|
|
|
"""Wrapper around DataStax Astra DB for vector-store workloads.
|
|
|
|
|
|
|
|
|
|
For quickstart and details, visit:
|
|
|
|
|
docs.datastax.com/en/astra/home/astra.html
|
|
|
|
|
|
|
|
|
|
Example:
|
|
|
|
|
.. code-block:: python
|
|
|
|
|
|
|
|
|
|
from langchain_astradb.vectorstores import AstraDBVectorStore
|
|
|
|
|
from langchain_openai.embeddings import OpenAIEmbeddings
|
|
|
|
|
|
|
|
|
|
embeddings = OpenAIEmbeddings()
|
|
|
|
|
vectorstore = AstraDBVectorStore(
|
|
|
|
|
embedding=embeddings,
|
|
|
|
|
collection_name="my_store",
|
|
|
|
|
token="AstraCS:...",
|
|
|
|
|
api_endpoint="https://<DB-ID>-<REGION>.apps.astra.datastax.com"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
vectorstore.add_texts(["Giraffes", "All good here"])
|
|
|
|
|
results = vectorstore.similarity_search("Everything's ok", k=1)
|
|
|
|
|
|
|
|
|
|
Constructor Args (only keyword-arguments accepted):
|
|
|
|
|
embedding (Embeddings): embedding function to use.
|
|
|
|
|
collection_name (str): name of the Astra DB collection to create/use.
|
|
|
|
|
token (Optional[str]): API token for Astra DB usage.
|
|
|
|
|
api_endpoint (Optional[str]): full URL to the API endpoint,
|
|
|
|
|
such as "https://<DB-ID>-us-east1.apps.astra.datastax.com".
|
|
|
|
|
astra_db_client (Optional[astrapy.db.AstraDB]):
|
|
|
|
|
*alternative to token+api_endpoint*,
|
|
|
|
|
you can pass an already-created 'astrapy.db.AstraDB' instance.
|
|
|
|
|
async_astra_db_client (Optional[astrapy.db.AsyncAstraDB]):
|
|
|
|
|
same as `astra_db_client`, but the basis for the async API
|
|
|
|
|
of the vector store.
|
|
|
|
|
namespace (Optional[str]): namespace (aka keyspace) where the
|
|
|
|
|
collection is created. Defaults to the database's "default namespace".
|
|
|
|
|
metric (Optional[str]): similarity function to use out of those
|
|
|
|
|
available in Astra DB. If left out, it will use Astra DB API's
|
|
|
|
|
defaults (i.e. "cosine" - but, for performance reasons,
|
|
|
|
|
"dot_product" is suggested if embeddings are normalized to one).
|
|
|
|
|
|
|
|
|
|
Advanced arguments (coming with sensible defaults):
|
|
|
|
|
batch_size (Optional[int]): Size of batches for bulk insertions.
|
|
|
|
|
bulk_insert_batch_concurrency (Optional[int]): Number of threads
|
|
|
|
|
to insert batches concurrently.
|
|
|
|
|
bulk_insert_overwrite_concurrency (Optional[int]): Number of
|
|
|
|
|
threads in a batch to insert pre-existing entries.
|
|
|
|
|
bulk_delete_concurrency (Optional[int]): Number of threads
|
|
|
|
|
(for deleting multiple rows concurrently).
|
|
|
|
|
pre_delete_collection (Optional[bool]): whether to delete the collection
|
|
|
|
|
before creating it. If False and the collection already exists,
|
|
|
|
|
the collection will be used as is.
|
|
|
|
|
|
|
|
|
|
A note on concurrency: as a rule of thumb, on a typical client machine
|
|
|
|
|
it is suggested to keep the quantity
|
|
|
|
|
bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency
|
|
|
|
|
much below 1000 to avoid exhausting the client multithreading/networking
|
|
|
|
|
resources. The hardcoded defaults are somewhat conservative to meet
|
|
|
|
|
most machines' specs, but a sensible choice to test may be:
|
|
|
|
|
bulk_insert_batch_concurrency = 80
|
|
|
|
|
bulk_insert_overwrite_concurrency = 10
|
|
|
|
|
A bit of experimentation is required to nail the best results here,
|
|
|
|
|
depending on both the machine/network specs and the expected workload
|
|
|
|
|
(specifically, how often a write is an update of an existing id).
|
|
|
|
|
Remember you can pass concurrency settings to individual calls to
|
|
|
|
|
add_texts and add_documents as well.
|
|
|
|
|
|
|
|
|
|
A note on passing astra_db_client and/or async_astra_db_client instead
|
|
|
|
|
of the credentials (token, api_endpoint):
|
|
|
|
|
- if you pass only the async client when creating the store,
|
|
|
|
|
the sync methods will error when called.
|
|
|
|
|
- conversely, if you pass only the sync client, the async methods will
|
|
|
|
|
still be available, but will be wrapping its sync counterpart
|
|
|
|
|
in a `run_in_executor` construct instead of using the native async.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
@staticmethod
|
|
|
|
|
def _filter_to_metadata(filter_dict: Optional[Dict[str, Any]]) -> Dict[str, Any]:
|
|
|
|
|
if filter_dict is None:
|
|
|
|
@ -180,10 +104,71 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
bulk_delete_concurrency: Optional[int] = None,
|
|
|
|
|
pre_delete_collection: bool = False,
|
|
|
|
|
) -> None:
|
|
|
|
|
"""
|
|
|
|
|
Create an AstraDBVectorStore vector store object. See class docstring for help.
|
|
|
|
|
"""
|
|
|
|
|
"""Wrapper around DataStax Astra DB for vector-store workloads.
|
|
|
|
|
|
|
|
|
|
For quickstart and details, visit
|
|
|
|
|
https://docs.datastax.com/en/astra/astra-db-vector/
|
|
|
|
|
|
|
|
|
|
Example:
|
|
|
|
|
.. code-block:: python
|
|
|
|
|
|
|
|
|
|
from langchain_astradb.vectorstores import AstraDBVectorStore
|
|
|
|
|
from langchain_openai.embeddings import OpenAIEmbeddings
|
|
|
|
|
embeddings = OpenAIEmbeddings()
|
|
|
|
|
vectorstore = AstraDBVectorStore(
|
|
|
|
|
embedding=embeddings,
|
|
|
|
|
collection_name="my_store",
|
|
|
|
|
token="AstraCS:...",
|
|
|
|
|
api_endpoint="https://<DB-ID>-<REGION>.apps.astra.datastax.com"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
vectorstore.add_texts(["Giraffes", "All good here"])
|
|
|
|
|
results = vectorstore.similarity_search("Everything's ok", k=1)
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
embedding: embedding function to use.
|
|
|
|
|
collection_name: name of the Astra DB collection to create/use.
|
|
|
|
|
token: API token for Astra DB usage.
|
|
|
|
|
api_endpoint: full URL to the API endpoint, such as
|
|
|
|
|
`https://<DB-ID>-us-east1.apps.astra.datastax.com`.
|
|
|
|
|
astra_db_client: *alternative to token+api_endpoint*,
|
|
|
|
|
you can pass an already-created 'astrapy.db.AstraDB' instance.
|
|
|
|
|
async_astra_db_client: *alternative to token+api_endpoint*,
|
|
|
|
|
you can pass an already-created 'astrapy.db.AsyncAstraDB' instance.
|
|
|
|
|
namespace: namespace (aka keyspace) where the collection is created.
|
|
|
|
|
Defaults to the database's "default namespace".
|
|
|
|
|
metric: similarity function to use out of those available in Astra DB.
|
|
|
|
|
If left out, it will use Astra DB API's defaults (i.e. "cosine" - but,
|
|
|
|
|
for performance reasons, "dot_product" is suggested if embeddings are
|
|
|
|
|
normalized to one).
|
|
|
|
|
batch_size: Size of batches for bulk insertions.
|
|
|
|
|
bulk_insert_batch_concurrency: Number of threads or coroutines to insert
|
|
|
|
|
batches concurrently.
|
|
|
|
|
bulk_insert_overwrite_concurrency: Number of threads or coroutines in a
|
|
|
|
|
batch to insert pre-existing entries.
|
|
|
|
|
bulk_delete_concurrency: Number of threads (for deleting multiple rows
|
|
|
|
|
concurrently).
|
|
|
|
|
pre_delete_collection: whether to delete the collection before creating it.
|
|
|
|
|
If False and the collection already exists, the collection will be used
|
|
|
|
|
as is.
|
|
|
|
|
|
|
|
|
|
Note:
|
|
|
|
|
For concurrency in synchronous :meth:`~add_texts`, as a rule of thumb, on a
|
|
|
|
|
typical client machine it is suggested to keep the quantity
|
|
|
|
|
bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency
|
|
|
|
|
much below 1000 to avoid exhausting the client multithreading/networking
|
|
|
|
|
resources. The hardcoded defaults are somewhat conservative to meet
|
|
|
|
|
most machines' specs, but a sensible choice to test may be:
|
|
|
|
|
|
|
|
|
|
- bulk_insert_batch_concurrency = 80
|
|
|
|
|
- bulk_insert_overwrite_concurrency = 10
|
|
|
|
|
|
|
|
|
|
A bit of experimentation is required to nail the best results here,
|
|
|
|
|
depending on both the machine/network specs and the expected workload
|
|
|
|
|
(specifically, how often a write is an update of an existing id).
|
|
|
|
|
Remember you can pass concurrency settings to individual calls to
|
|
|
|
|
:meth:`~add_texts` and :meth:`~add_documents` as well.
|
|
|
|
|
"""
|
|
|
|
|
# Conflicting-arg checks:
|
|
|
|
|
if astra_db_client is not None or async_astra_db_client is not None:
|
|
|
|
|
if token is not None or api_endpoint is not None:
|
|
|
|
@ -349,8 +334,13 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
|
|
|
|
|
def delete_by_document_id(self, document_id: str) -> bool:
|
|
|
|
|
"""
|
|
|
|
|
Remove a single document from the store, given its document_id (str).
|
|
|
|
|
Return True if a document has indeed been deleted, False if ID not found.
|
|
|
|
|
Remove a single document from the store, given its document ID.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
document_id: The document ID
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
True if a document has indeed been deleted, False if ID not found.
|
|
|
|
|
"""
|
|
|
|
|
self._ensure_astra_db_client()
|
|
|
|
|
# self.collection is not None (by _ensure_astra_db_client)
|
|
|
|
@ -361,8 +351,13 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
|
|
|
|
|
async def adelete_by_document_id(self, document_id: str) -> bool:
|
|
|
|
|
"""
|
|
|
|
|
Remove a single document from the store, given its document_id (str).
|
|
|
|
|
Return True if a document has indeed been deleted, False if ID not found.
|
|
|
|
|
Remove a single document from the store, given its document ID.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
document_id: The document ID
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
True if a document has indeed been deleted, False if ID not found.
|
|
|
|
|
"""
|
|
|
|
|
await self._ensure_db_setup()
|
|
|
|
|
if not self.async_collection:
|
|
|
|
@ -381,13 +376,12 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
"""Delete by vector ids.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
ids (Optional[List[str]]): List of ids to delete.
|
|
|
|
|
concurrency (Optional[int]): max number of threads issuing
|
|
|
|
|
single-doc delete requests. Defaults to instance-level setting.
|
|
|
|
|
ids: List of ids to delete.
|
|
|
|
|
concurrency: max number of threads issuing single-doc delete requests.
|
|
|
|
|
Defaults to instance-level setting.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
Optional[bool]: True if deletion is successful,
|
|
|
|
|
False otherwise, None if not implemented.
|
|
|
|
|
True if deletion is successful, False otherwise.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
if kwargs:
|
|
|
|
@ -416,17 +410,15 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
concurrency: Optional[int] = None,
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> Optional[bool]:
|
|
|
|
|
"""Delete by vector ID or other criteria.
|
|
|
|
|
"""Delete by vector ids.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
ids: List of ids to delete.
|
|
|
|
|
concurrency (Optional[int]): max number of concurrent delete queries.
|
|
|
|
|
concurrency: max concurrency of single-doc delete requests.
|
|
|
|
|
Defaults to instance-level setting.
|
|
|
|
|
**kwargs: Other keyword arguments that subclasses might use.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
Optional[bool]: True if deletion is successful,
|
|
|
|
|
False otherwise, None if not implemented.
|
|
|
|
|
True if deletion is successful, False otherwise.
|
|
|
|
|
"""
|
|
|
|
|
if kwargs:
|
|
|
|
|
warnings.warn(
|
|
|
|
@ -447,7 +439,7 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
def delete_collection(self) -> None:
|
|
|
|
|
"""
|
|
|
|
|
Completely delete the collection from the database (as opposed
|
|
|
|
|
to 'clear()', which empties it only).
|
|
|
|
|
to :meth:`~clear`, which empties it only).
|
|
|
|
|
Stored data is lost and unrecoverable, resources are freed.
|
|
|
|
|
Use with caution.
|
|
|
|
|
"""
|
|
|
|
@ -460,7 +452,7 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
async def adelete_collection(self) -> None:
|
|
|
|
|
"""
|
|
|
|
|
Completely delete the collection from the database (as opposed
|
|
|
|
|
to 'clear()', which empties it only).
|
|
|
|
|
to :meth:`~aclear`, which empties it only).
|
|
|
|
|
Stored data is lost and unrecoverable, resources are freed.
|
|
|
|
|
Use with caution.
|
|
|
|
|
"""
|
|
|
|
@ -553,28 +545,29 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
will be replaced.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
texts (Iterable[str]): Texts to add to the vectorstore.
|
|
|
|
|
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
|
|
|
|
|
ids (Optional[List[str]], optional): Optional list of ids.
|
|
|
|
|
batch_size (Optional[int]): Number of documents in each API call.
|
|
|
|
|
texts: Texts to add to the vectorstore.
|
|
|
|
|
metadatas: Optional list of metadatas.
|
|
|
|
|
ids: Optional list of ids.
|
|
|
|
|
batch_size: Number of documents in each API call.
|
|
|
|
|
Check the underlying Astra DB HTTP API specs for the max value
|
|
|
|
|
(20 at the time of writing this). If not provided, defaults
|
|
|
|
|
to the instance-level setting.
|
|
|
|
|
batch_concurrency (Optional[int]): number of threads to process
|
|
|
|
|
batch_concurrency: number of threads to process
|
|
|
|
|
insertion batches concurrently. Defaults to instance-level
|
|
|
|
|
setting if not provided.
|
|
|
|
|
overwrite_concurrency (Optional[int]): number of threads to process
|
|
|
|
|
overwrite_concurrency: number of threads to process
|
|
|
|
|
pre-existing documents in each batch (which require individual
|
|
|
|
|
API calls). Defaults to instance-level setting if not provided.
|
|
|
|
|
|
|
|
|
|
A note on metadata: there are constraints on the allowed field names
|
|
|
|
|
in this dictionary, coming from the underlying Astra DB API.
|
|
|
|
|
For instance, the `$` (dollar sign) cannot be used in the dict keys.
|
|
|
|
|
See this document for details:
|
|
|
|
|
docs.datastax.com/en/astra-serverless/docs/develop/dev-with-json.html
|
|
|
|
|
Note:
|
|
|
|
|
There are constraints on the allowed field names
|
|
|
|
|
in the metadata dictionaries, coming from the underlying Astra DB API.
|
|
|
|
|
For instance, the `$` (dollar sign) cannot be used in the dict keys.
|
|
|
|
|
See this document for details:
|
|
|
|
|
https://docs.datastax.com/en/astra/astra-db-vector/api-reference/data-api.html
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List[str]: List of ids of the added texts.
|
|
|
|
|
The list of ids of the added texts.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
if kwargs:
|
|
|
|
@ -649,27 +642,29 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
will be replaced.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
texts (Iterable[str]): Texts to add to the vectorstore.
|
|
|
|
|
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
|
|
|
|
|
ids (Optional[List[str]], optional): Optional list of ids.
|
|
|
|
|
batch_size (Optional[int]): Number of documents in each API call.
|
|
|
|
|
texts: Texts to add to the vectorstore.
|
|
|
|
|
metadatas: Optional list of metadatas.
|
|
|
|
|
ids: Optional list of ids.
|
|
|
|
|
batch_size: Number of documents in each API call.
|
|
|
|
|
Check the underlying Astra DB HTTP API specs for the max value
|
|
|
|
|
(20 at the time of writing this). If not provided, defaults
|
|
|
|
|
to the instance-level setting.
|
|
|
|
|
batch_concurrency (Optional[int]): number of concurrent batch insertions.
|
|
|
|
|
Defaults to instance-level setting if not provided.
|
|
|
|
|
overwrite_concurrency (Optional[int]): number of concurrent API calls to
|
|
|
|
|
process pre-existing documents in each batch.
|
|
|
|
|
Defaults to instance-level setting if not provided.
|
|
|
|
|
|
|
|
|
|
A note on metadata: there are constraints on the allowed field names
|
|
|
|
|
in this dictionary, coming from the underlying Astra DB API.
|
|
|
|
|
For instance, the `$` (dollar sign) cannot be used in the dict keys.
|
|
|
|
|
See this document for details:
|
|
|
|
|
docs.datastax.com/en/astra-serverless/docs/develop/dev-with-json.html
|
|
|
|
|
batch_concurrency: number of coroutines to process
|
|
|
|
|
insertion batches concurrently. Defaults to instance-level
|
|
|
|
|
setting if not provided.
|
|
|
|
|
overwrite_concurrency: number of coroutines to process
|
|
|
|
|
pre-existing documents in each batch (which require individual
|
|
|
|
|
API calls). Defaults to instance-level setting if not provided.
|
|
|
|
|
|
|
|
|
|
Note:
|
|
|
|
|
There are constraints on the allowed field names
|
|
|
|
|
in the metadata dictionaries, coming from the underlying Astra DB API.
|
|
|
|
|
For instance, the `$` (dollar sign) cannot be used in the dict keys.
|
|
|
|
|
See this document for details:
|
|
|
|
|
https://docs.datastax.com/en/astra/astra-db-vector/api-reference/data-api.html
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List[str]: List of ids of the added texts.
|
|
|
|
|
The list of ids of the added texts.
|
|
|
|
|
"""
|
|
|
|
|
await self._ensure_db_setup()
|
|
|
|
|
if not self.async_collection:
|
|
|
|
@ -744,13 +739,15 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
) -> List[Tuple[Document, float, str]]:
|
|
|
|
|
"""Return docs most similar to embedding vector.
|
|
|
|
|
"""Return docs most similar to embedding vector with score and id.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
embedding (str): Embedding to look up documents similar to.
|
|
|
|
|
k (int): Number of Documents to return. Defaults to 4.
|
|
|
|
|
embedding: Embedding to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of (Document, score, id), the most similar to the query vector.
|
|
|
|
|
The list of (Document, score, id), the most similar to the query vector.
|
|
|
|
|
"""
|
|
|
|
|
self._ensure_astra_db_client()
|
|
|
|
|
metadata_parameter = self._filter_to_metadata(filter)
|
|
|
|
@ -787,13 +784,15 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
) -> List[Tuple[Document, float, str]]:
|
|
|
|
|
"""Return docs most similar to embedding vector.
|
|
|
|
|
"""Return docs most similar to embedding vector with score and id.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
embedding (str): Embedding to look up documents similar to.
|
|
|
|
|
k (int): Number of Documents to return. Defaults to 4.
|
|
|
|
|
embedding: Embedding to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of (Document, score, id), the most similar to the query vector.
|
|
|
|
|
The list of (Document, score, id), the most similar to the query vector.
|
|
|
|
|
"""
|
|
|
|
|
await self._ensure_db_setup()
|
|
|
|
|
if not self.async_collection:
|
|
|
|
@ -833,6 +832,16 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
) -> List[Tuple[Document, float, str]]:
|
|
|
|
|
"""Return docs most similar to the query with score and id.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query: Query to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
The list of (Document, score, id), the most similar to the query.
|
|
|
|
|
"""
|
|
|
|
|
embedding_vector = self.embedding.embed_query(query)
|
|
|
|
|
return self.similarity_search_with_score_id_by_vector(
|
|
|
|
|
embedding=embedding_vector,
|
|
|
|
@ -846,6 +855,16 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
) -> List[Tuple[Document, float, str]]:
|
|
|
|
|
"""Return docs most similar to the query with score and id.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query: Query to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
The list of (Document, score, id), the most similar to the query.
|
|
|
|
|
"""
|
|
|
|
|
embedding_vector = await self.embedding.aembed_query(query)
|
|
|
|
|
return await self.asimilarity_search_with_score_id_by_vector(
|
|
|
|
|
embedding=embedding_vector,
|
|
|
|
@ -859,13 +878,15 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
|
"""Return docs most similar to embedding vector.
|
|
|
|
|
"""Return docs most similar to embedding vector with score.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
embedding (str): Embedding to look up documents similar to.
|
|
|
|
|
k (int): Number of Documents to return. Defaults to 4.
|
|
|
|
|
embedding: Embedding to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of (Document, score), the most similar to the query vector.
|
|
|
|
|
The list of (Document, score), the most similar to the query vector.
|
|
|
|
|
"""
|
|
|
|
|
return [
|
|
|
|
|
(doc, score)
|
|
|
|
@ -882,13 +903,15 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
|
"""Return docs most similar to embedding vector.
|
|
|
|
|
"""Return docs most similar to embedding vector with score.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
embedding (str): Embedding to look up documents similar to.
|
|
|
|
|
k (int): Number of Documents to return. Defaults to 4.
|
|
|
|
|
embedding: Embedding to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of (Document, score), the most similar to the query vector.
|
|
|
|
|
The list of (Document, score), the most similar to the query vector.
|
|
|
|
|
"""
|
|
|
|
|
return [
|
|
|
|
|
(doc, score)
|
|
|
|
@ -910,6 +933,16 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Return docs most similar to query.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query: Query to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
The list of Documents most similar to the query.
|
|
|
|
|
"""
|
|
|
|
|
embedding_vector = self.embedding.embed_query(query)
|
|
|
|
|
return self.similarity_search_by_vector(
|
|
|
|
|
embedding_vector,
|
|
|
|
@ -924,6 +957,16 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Return docs most similar to query.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query: Query to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
The list of Documents most similar to the query.
|
|
|
|
|
"""
|
|
|
|
|
embedding_vector = await self.embedding.aembed_query(query)
|
|
|
|
|
return await self.asimilarity_search_by_vector(
|
|
|
|
|
embedding_vector,
|
|
|
|
@ -938,6 +981,16 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Return docs most similar to embedding vector.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
embedding: Embedding to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
The list of Documents most similar to the query vector.
|
|
|
|
|
"""
|
|
|
|
|
return [
|
|
|
|
|
doc
|
|
|
|
|
for doc, _ in self.similarity_search_with_score_by_vector(
|
|
|
|
@ -954,6 +1007,16 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Return docs most similar to embedding vector.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
embedding: Embedding to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
The list of Documents most similar to the query vector.
|
|
|
|
|
"""
|
|
|
|
|
return [
|
|
|
|
|
doc
|
|
|
|
|
for doc, _ in await self.asimilarity_search_with_score_by_vector(
|
|
|
|
@ -969,6 +1032,16 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
|
"""Return docs most similar to query with score.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query: Query to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
The list of (Document, score), the most similar to the query.
|
|
|
|
|
"""
|
|
|
|
|
embedding_vector = self.embedding.embed_query(query)
|
|
|
|
|
return self.similarity_search_with_score_by_vector(
|
|
|
|
|
embedding_vector,
|
|
|
|
@ -982,6 +1055,16 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
k: int = 4,
|
|
|
|
|
filter: Optional[Dict[str, Any]] = None,
|
|
|
|
|
) -> List[Tuple[Document, float]]:
|
|
|
|
|
"""Return docs most similar to query with score.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query: Query to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return. Defaults to 4.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
The list of (Document, score), the most similar to the query.
|
|
|
|
|
"""
|
|
|
|
|
embedding_vector = await self.embedding.aembed_query(query)
|
|
|
|
|
return await self.asimilarity_search_with_score_by_vector(
|
|
|
|
|
embedding_vector,
|
|
|
|
@ -1022,17 +1105,21 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Return docs selected using the maximal marginal relevance.
|
|
|
|
|
|
|
|
|
|
Maximal marginal relevance optimizes for similarity to query AND diversity
|
|
|
|
|
among selected documents.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
embedding: Embedding to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return.
|
|
|
|
|
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
|
|
|
|
lambda_mult: Number between 0 and 1 that determines the degree
|
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of Documents selected by maximal marginal relevance.
|
|
|
|
|
The list of Documents selected by maximal marginal relevance.
|
|
|
|
|
"""
|
|
|
|
|
self._ensure_astra_db_client()
|
|
|
|
|
metadata_parameter = self._filter_to_metadata(filter)
|
|
|
|
@ -1064,17 +1151,21 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Return docs selected using the maximal marginal relevance.
|
|
|
|
|
|
|
|
|
|
Maximal marginal relevance optimizes for similarity to query AND diversity
|
|
|
|
|
among selected documents.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
embedding: Embedding to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return.
|
|
|
|
|
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
|
|
|
|
lambda_mult: Number between 0 and 1 that determines the degree
|
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of Documents selected by maximal marginal relevance.
|
|
|
|
|
The list of Documents selected by maximal marginal relevance.
|
|
|
|
|
"""
|
|
|
|
|
await self._ensure_db_setup()
|
|
|
|
|
if not self.async_collection:
|
|
|
|
@ -1117,18 +1208,21 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Return docs selected using the maximal marginal relevance.
|
|
|
|
|
|
|
|
|
|
Maximal marginal relevance optimizes for similarity to query AND diversity
|
|
|
|
|
among selected documents.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query (str): Text to look up documents similar to.
|
|
|
|
|
k (int = 4): Number of Documents to return.
|
|
|
|
|
fetch_k (int = 20): Number of Documents to fetch to pass to MMR algorithm.
|
|
|
|
|
lambda_mult (float = 0.5): Number between 0 and 1 that determines the degree
|
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
|
Optional.
|
|
|
|
|
query: Query to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return.
|
|
|
|
|
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
|
|
|
|
lambda_mult: Number between 0 and 1 that determines the degree
|
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of Documents selected by maximal marginal relevance.
|
|
|
|
|
The list of Documents selected by maximal marginal relevance.
|
|
|
|
|
"""
|
|
|
|
|
embedding_vector = self.embedding.embed_query(query)
|
|
|
|
|
return self.max_marginal_relevance_search_by_vector(
|
|
|
|
@ -1149,18 +1243,21 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
**kwargs: Any,
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
"""Return docs selected using the maximal marginal relevance.
|
|
|
|
|
|
|
|
|
|
Maximal marginal relevance optimizes for similarity to query AND diversity
|
|
|
|
|
among selected documents.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
query (str): Text to look up documents similar to.
|
|
|
|
|
k (int = 4): Number of Documents to return.
|
|
|
|
|
fetch_k (int = 20): Number of Documents to fetch to pass to MMR algorithm.
|
|
|
|
|
lambda_mult (float = 0.5): Number between 0 and 1 that determines the degree
|
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
|
Optional.
|
|
|
|
|
query: Query to look up documents similar to.
|
|
|
|
|
k: Number of Documents to return.
|
|
|
|
|
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
|
|
|
|
lambda_mult: Number between 0 and 1 that determines the degree
|
|
|
|
|
of diversity among the results with 0 corresponding
|
|
|
|
|
to maximum diversity and 1 to minimum diversity.
|
|
|
|
|
filter: Filter on the metadata to apply.
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
List of Documents selected by maximal marginal relevance.
|
|
|
|
|
The list of Documents selected by maximal marginal relevance.
|
|
|
|
|
"""
|
|
|
|
|
embedding_vector = await self.embedding.aembed_query(query)
|
|
|
|
|
return await self.amax_marginal_relevance_search_by_vector(
|
|
|
|
@ -1239,12 +1336,12 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
"""Create an Astra DB vectorstore from raw texts.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
texts (List[str]): the texts to insert.
|
|
|
|
|
embedding (Embeddings): the embedding function to use in the store.
|
|
|
|
|
metadatas (Optional[List[dict]]): metadata dicts for the texts.
|
|
|
|
|
ids (Optional[List[str]]): ids to associate to the texts.
|
|
|
|
|
*Additional arguments*: you can pass any argument that you would
|
|
|
|
|
to 'add_texts' and/or to the 'AstraDBVectorStore' constructor
|
|
|
|
|
texts: the texts to insert.
|
|
|
|
|
embedding: the embedding function to use in the store.
|
|
|
|
|
metadatas: metadata dicts for the texts.
|
|
|
|
|
ids: ids to associate to the texts.
|
|
|
|
|
**kwargs: you can pass any argument that you would
|
|
|
|
|
to :meth:`~add_texts` and/or to the 'AstraDBVectorStore' constructor
|
|
|
|
|
(see these methods for details). These arguments will be
|
|
|
|
|
routed to the respective methods as they are.
|
|
|
|
|
|
|
|
|
@ -1274,12 +1371,12 @@ class AstraDBVectorStore(VectorStore):
|
|
|
|
|
"""Create an Astra DB vectorstore from raw texts.
|
|
|
|
|
|
|
|
|
|
Args:
|
|
|
|
|
texts (List[str]): the texts to insert.
|
|
|
|
|
embedding (Embeddings): the embedding function to use in the store.
|
|
|
|
|
metadatas (Optional[List[dict]]): metadata dicts for the texts.
|
|
|
|
|
ids (Optional[List[str]]): ids to associate to the texts.
|
|
|
|
|
*Additional arguments*: you can pass any argument that you would
|
|
|
|
|
to 'add_texts' and/or to the 'AstraDBVectorStore' constructor
|
|
|
|
|
texts: the texts to insert.
|
|
|
|
|
embedding: the embedding function to use in the store.
|
|
|
|
|
metadatas: metadata dicts for the texts.
|
|
|
|
|
ids: ids to associate to the texts.
|
|
|
|
|
**kwargs: you can pass any argument that you would
|
|
|
|
|
to :meth:`~add_texts` and/or to the 'AstraDBVectorStore' constructor
|
|
|
|
|
(see these methods for details). These arguments will be
|
|
|
|
|
routed to the respective methods as they are.
|
|
|
|
|
|
|
|
|
|