Add delete and ensure add_texts performs upsert (w/ ID optional) (#6126)

## Goal 

We want to ensure consistency across vectordbs:
1/ add `delete` by ID method to the base vectorstore class
2/ ensure `add_texts` performs `upsert` with ID optionally passed

## Testing
- [x] Pinecone: notebook test w/ `langchain_test` vectorstore.
- [x] Chroma: Review by @jeffchuber, notebook test w/ in memory
vectorstore.
- [x] Supabase: Review by @copple, notebook test w/ `langchain_test`
table.
- [x] Weaviate: Notebook test w/ `langchain_test` index. 
- [x] Elastic: Reviewed by @vestal. Notebook test w/ `langchain_test`
table.
- [ ] Redis: Asked for review from owner of recent `delete` method
https://github.com/hwchase17/langchain/pull/6222
This commit is contained in:
Lance Martin 2023-06-23 13:03:10 -07:00 committed by GitHub
parent 393f469eb3
commit be02572d58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 115 additions and 20 deletions

View File

@ -48,6 +48,21 @@ class VectorStore(ABC):
List of ids from adding the texts into the vectorstore.
"""
def delete(self, ids: List[str]) -> Optional[bool]:
    """Delete vectors by ID.

    Base-class hook: subclasses override this to remove entries from their
    backing store.

    Args:
        ids: List of ids to delete.

    Returns:
        Optional[bool]: True if deletion is successful,
        False otherwise, None if not implemented.

    Raises:
        NotImplementedError: Always, unless a subclass overrides this method.
    """
    # Fix: the message previously said "delete_by_id", which does not match
    # this method's actual name; keep the error consistent with the API.
    raise NotImplementedError("delete method must be implemented by subclass.")
async def aadd_texts(
self,
texts: Iterable[str],

View File

@ -146,7 +146,7 @@ class Chroma(VectorStore):
embeddings = None
if self._embedding_function is not None:
embeddings = self._embedding_function.embed_documents(list(texts))
self._collection.add(
self._collection.upsert(
metadatas=metadatas, embeddings=embeddings, documents=texts, ids=ids
)
return ids
@ -442,3 +442,11 @@ class Chroma(VectorStore):
client_settings=client_settings,
client=client,
)
def delete(self, ids: List[str]) -> None:
    """Remove the vectors identified by ``ids`` from the collection.

    Args:
        ids: List of ids to delete.
    """
    # Chroma's collection API accepts the id list directly, so simply
    # delegate to the underlying collection.
    self._collection.delete(ids=ids)

View File

@ -158,6 +158,7 @@ class ElasticVectorSearch(VectorStore, ABC):
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
refresh_indices: bool = True,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore.
@ -179,7 +180,7 @@ class ElasticVectorSearch(VectorStore, ABC):
"Please install it with `pip install elasticsearch`."
)
requests = []
ids = []
ids = ids or [str(uuid.uuid4()) for _ in texts]
embeddings = self.embedding.embed_documents(list(texts))
dim = len(embeddings[0])
mapping = _default_text_mapping(dim)
@ -194,16 +195,14 @@ class ElasticVectorSearch(VectorStore, ABC):
for i, text in enumerate(texts):
metadata = metadatas[i] if metadatas else {}
_id = str(uuid.uuid4())
request = {
"_op_type": "index",
"_index": self.index_name,
"vector": embeddings[i],
"text": text,
"metadata": metadata,
"_id": _id,
"_id": ids[i],
}
ids.append(_id)
requests.append(request)
bulk(self.client, requests)
@ -318,6 +317,17 @@ class ElasticVectorSearch(VectorStore, ABC):
)
return response
def delete(self, ids: List[str]) -> None:
    """Delete documents from the Elasticsearch index by their vector IDs.

    Args:
        ids: List of ids to delete.
    """
    # TODO: Check if this can be done in bulk (e.g. via the bulk helper with
    # "_op_type": "delete") instead of one request per id.
    for doc_id in ids:  # renamed from `id` to avoid shadowing the builtin
        self.client.delete(index=self.index_name, id=doc_id)
class ElasticKnnSearch(ElasticVectorSearch):
"""

View File

@ -353,3 +353,16 @@ class Pinecone(VectorStore):
return cls(
pinecone.Index(index_name), embedding.embed_query, text_key, namespace
)
def delete(self, ids: List[str]) -> None:
    """Delete by vector IDs.

    Args:
        ids: List of ids to delete.
    """
    # Pinecone limits a single delete request to this many ids, so the
    # deletions are issued in fixed-size batches.
    max_batch = 1000
    for start in range(0, len(ids), max_batch):
        batch = ids[start : start + max_batch]
        self._index.delete(ids=batch)

View File

@ -187,7 +187,6 @@ class Redis(VectorStore):
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
embeddings: Optional[List[List[float]]] = None,
keys: Optional[List[str]] = None,
batch_size: int = 1000,
**kwargs: Any,
) -> List[str]:
@ -199,7 +198,7 @@ class Redis(VectorStore):
Defaults to None.
embeddings (Optional[List[List[float]]], optional): Optional pre-generated
embeddings. Defaults to None.
keys (Optional[List[str]], optional): Optional key values to use as ids.
keys (List[str]) or ids (List[str]): Identifiers of entries.
Defaults to None.
batch_size (int, optional): Batch size to use for writes. Defaults to 1000.
@ -209,11 +208,15 @@ class Redis(VectorStore):
ids = []
prefix = _redis_prefix(self.index_name)
# Get keys or ids from kwargs
# Other vectorstores use ids
keys_or_ids = kwargs.get("keys", kwargs.get("ids"))
# Write data to redis
pipeline = self.client.pipeline(transaction=False)
for i, text in enumerate(texts):
# Use provided values by default or fallback
key = keys[i] if keys else _redis_key(prefix)
key = keys_or_ids[i] if keys_or_ids else _redis_key(prefix)
metadata = metadatas[i] if metadatas else {}
embedding = embeddings[i] if embeddings else self.embedding_function(text)
pipeline.hset(
@ -461,19 +464,23 @@ class Redis(VectorStore):
@staticmethod
def delete(
keys: List[str],
ids: List[str],
**kwargs: Any,
) -> bool:
"""
Delete a Redis entry.
Args:
keys (List[str]): Keys of entries to delete.
ids: List of ids (keys) to delete.
Returns:
bool: Whether or not the deletions were successful.
"""
redis_url = get_from_dict_or_env(kwargs, "redis_url", "REDIS_URL")
if ids is None:
raise ValueError("'ids' (keys)() were not provided.")
try:
import redis
except ImportError:
@ -491,11 +498,11 @@ class Redis(VectorStore):
raise ValueError(f"Your redis connected error: {e}")
# Check if index exists
try:
client.delete(*keys)
client.delete(*ids)
logger.info("Entries deleted")
return True
except: # noqa: E722
# Keys not exist
# ids does not exist
return False
@staticmethod

View File

@ -1,5 +1,6 @@
from __future__ import annotations
import uuid
from itertools import repeat
from typing import (
TYPE_CHECKING,
@ -70,12 +71,14 @@ class SupabaseVectorStore(VectorStore):
self,
texts: Iterable[str],
metadatas: Optional[List[dict[Any, Any]]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
ids = ids or [str(uuid.uuid4()) for _ in texts]
docs = self._texts_to_documents(texts, metadatas)
vectors = self._embedding.embed_documents(list(texts))
return self.add_vectors(vectors, docs)
return self.add_vectors(vectors, docs, ids)
@classmethod
def from_texts(
@ -86,6 +89,7 @@ class SupabaseVectorStore(VectorStore):
client: Optional[supabase.client.Client] = None,
table_name: Optional[str] = "documents",
query_name: Union[str, None] = "match_documents",
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> "SupabaseVectorStore":
"""Return VectorStore initialized from texts and embeddings."""
@ -97,8 +101,9 @@ class SupabaseVectorStore(VectorStore):
raise ValueError("Supabase document table_name is required.")
embeddings = embedding.embed_documents(texts)
ids = [str(uuid.uuid4()) for _ in texts]
docs = cls._texts_to_documents(texts, metadatas)
_ids = cls._add_vectors(client, table_name, embeddings, docs)
_ids = cls._add_vectors(client, table_name, embeddings, docs, ids)
return cls(
client=client,
@ -108,9 +113,12 @@ class SupabaseVectorStore(VectorStore):
)
def add_vectors(
self, vectors: List[List[float]], documents: List[Document]
self,
vectors: List[List[float]],
documents: List[Document],
ids: List[str],
) -> List[str]:
return self._add_vectors(self._client, self.table_name, vectors, documents)
return self._add_vectors(self._client, self.table_name, vectors, documents, ids)
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
@ -200,11 +208,13 @@ class SupabaseVectorStore(VectorStore):
table_name: str,
vectors: List[List[float]],
documents: List[Document],
ids: List[str],
) -> List[str]:
"""Add vectors to Supabase table."""
rows: List[dict[str, Any]] = [
{
"id": ids[idx],
"content": documents[idx].page_content,
"embedding": embedding,
"metadata": documents[idx].metadata, # type: ignore
@ -219,7 +229,7 @@ class SupabaseVectorStore(VectorStore):
for i in range(0, len(rows), chunk_size):
chunk = rows[i : i + chunk_size]
result = client.from_(table_name).insert(chunk).execute() # type: ignore
result = client.from_(table_name).upsert(chunk).execute() # type: ignore
if len(result.data) == 0:
raise Exception("Error inserting: No rows added")
@ -335,3 +345,20 @@ class SupabaseVectorStore(VectorStore):
embedding[0], k, fetch_k, lambda_mult=lambda_mult
)
return docs
def delete(self, ids: List[str]) -> None:
    """Delete rows from the Supabase table by vector ID.

    Args:
        ids: List of ids to delete.
    """
    # Iterate the ids directly: the previous implementation built a
    # throwaway list of {"id": id} dicts only to read the id back out,
    # and shadowed the `id` builtin in the process. The sequence of
    # client calls is unchanged.
    # TODO: Check if this can be done in bulk (e.g. a single filtered
    # delete) instead of one round-trip per id.
    for row_id in ids:
        self._client.from_(self.table_name).delete().eq("id", row_id).execute()

View File

@ -135,11 +135,15 @@ class Weaviate(VectorStore):
for key, val in metadatas[i].items():
data_properties[key] = _json_serializable(val)
# Allow for ids (consistent w/ other methods)
# Or uuids (backwards compatible w/ existing arg)
# If the UUID of one of the objects already exists
# then the existing object will be replaced by the new object.
_id = (
kwargs["uuids"][i] if "uuids" in kwargs else get_valid_uuid(uuid4())
)
_id = get_valid_uuid(uuid4())
if "uuids" in kwargs:
_id = kwargs["uuids"][i]
elif "ids" in kwargs:
_id = kwargs["ids"][i]
if self._embedding is not None:
vector = self._embedding.embed_documents([text])[0]
@ -465,3 +469,14 @@ class Weaviate(VectorStore):
relevance_score_fn=relevance_score_fn,
by_text=by_text,
)
def delete(self, ids: List[str]) -> None:
    """Delete objects from Weaviate by their vector IDs (UUIDs).

    Args:
        ids: List of ids to delete.
    """
    # TODO: Check if this can be done in bulk via Weaviate's batch API.
    for object_id in ids:  # renamed from `id` to avoid shadowing the builtin
        self._client.data_object.delete(uuid=object_id)