mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Add delete and ensure add_texts performs upsert (w/ ID optional) (#6126)
## Goal We want to ensure consistency across vectordbs: 1/ add `delete` by ID method to the base vectorstore class 2/ ensure `add_texts` performs `upsert` with ID optionally passed ## Testing - [x] Pinecone: notebook test w/ `langchain_test` vectorstore. - [x] Chroma: Reviewed by @jeffchuber, notebook test w/ in memory vectorstore. - [x] Supabase: Reviewed by @copple, notebook test w/ `langchain_test` table. - [x] Weaviate: Notebook test w/ `langchain_test` index. - [x] Elastic: Reviewed by @vestal. Notebook test w/ `langchain_test` table. - [ ] Redis: Asked for review from owner of recent `delete` method https://github.com/hwchase17/langchain/pull/6222
This commit is contained in:
parent
393f469eb3
commit
be02572d58
@ -48,6 +48,21 @@ class VectorStore(ABC):
|
||||
List of ids from adding the texts into the vectorstore.
|
||||
"""
|
||||
|
||||
def delete(self, ids: List[str]) -> Optional[bool]:
|
||||
"""Delete by vector ID.
|
||||
|
||||
Args:
|
||||
ids: List of ids to delete.
|
||||
|
||||
Returns:
|
||||
Optional[bool]: True if deletion is successful,
|
||||
False otherwise, None if not implemented.
|
||||
"""
|
||||
|
||||
raise NotImplementedError(
|
||||
"delete_by_id method must be implemented by subclass."
|
||||
)
|
||||
|
||||
async def aadd_texts(
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
|
@ -146,7 +146,7 @@ class Chroma(VectorStore):
|
||||
embeddings = None
|
||||
if self._embedding_function is not None:
|
||||
embeddings = self._embedding_function.embed_documents(list(texts))
|
||||
self._collection.add(
|
||||
self._collection.upsert(
|
||||
metadatas=metadatas, embeddings=embeddings, documents=texts, ids=ids
|
||||
)
|
||||
return ids
|
||||
@ -442,3 +442,11 @@ class Chroma(VectorStore):
|
||||
client_settings=client_settings,
|
||||
client=client,
|
||||
)
|
||||
|
||||
def delete(self, ids: List[str]) -> None:
|
||||
"""Delete by vector IDs.
|
||||
|
||||
Args:
|
||||
ids: List of ids to delete.
|
||||
"""
|
||||
self._collection.delete(ids=ids)
|
||||
|
@ -158,6 +158,7 @@ class ElasticVectorSearch(VectorStore, ABC):
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
refresh_indices: bool = True,
|
||||
ids: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
"""Run more texts through the embeddings and add to the vectorstore.
|
||||
@ -179,7 +180,7 @@ class ElasticVectorSearch(VectorStore, ABC):
|
||||
"Please install it with `pip install elasticsearch`."
|
||||
)
|
||||
requests = []
|
||||
ids = []
|
||||
ids = ids or [str(uuid.uuid4()) for _ in texts]
|
||||
embeddings = self.embedding.embed_documents(list(texts))
|
||||
dim = len(embeddings[0])
|
||||
mapping = _default_text_mapping(dim)
|
||||
@ -194,16 +195,14 @@ class ElasticVectorSearch(VectorStore, ABC):
|
||||
|
||||
for i, text in enumerate(texts):
|
||||
metadata = metadatas[i] if metadatas else {}
|
||||
_id = str(uuid.uuid4())
|
||||
request = {
|
||||
"_op_type": "index",
|
||||
"_index": self.index_name,
|
||||
"vector": embeddings[i],
|
||||
"text": text,
|
||||
"metadata": metadata,
|
||||
"_id": _id,
|
||||
"_id": ids[i],
|
||||
}
|
||||
ids.append(_id)
|
||||
requests.append(request)
|
||||
bulk(self.client, requests)
|
||||
|
||||
@ -318,6 +317,17 @@ class ElasticVectorSearch(VectorStore, ABC):
|
||||
)
|
||||
return response
|
||||
|
||||
def delete(self, ids: List[str]) -> None:
|
||||
"""Delete by vector IDs.
|
||||
|
||||
Args:
|
||||
ids: List of ids to delete.
|
||||
"""
|
||||
|
||||
# TODO: Check if this can be done in bulk
|
||||
for id in ids:
|
||||
self.client.delete(index=self.index_name, id=id)
|
||||
|
||||
|
||||
class ElasticKnnSearch(ElasticVectorSearch):
|
||||
"""
|
||||
|
@ -353,3 +353,16 @@ class Pinecone(VectorStore):
|
||||
return cls(
|
||||
pinecone.Index(index_name), embedding.embed_query, text_key, namespace
|
||||
)
|
||||
|
||||
def delete(self, ids: List[str]) -> None:
|
||||
"""Delete by vector IDs.
|
||||
|
||||
Args:
|
||||
ids: List of ids to delete.
|
||||
"""
|
||||
|
||||
# This is the maximum number of IDs that can be deleted
|
||||
chunk_size = 1000
|
||||
for i in range(0, len(ids), chunk_size):
|
||||
chunk = ids[i : i + chunk_size]
|
||||
self._index.delete(ids=chunk)
|
||||
|
@ -187,7 +187,6 @@ class Redis(VectorStore):
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[dict]] = None,
|
||||
embeddings: Optional[List[List[float]]] = None,
|
||||
keys: Optional[List[str]] = None,
|
||||
batch_size: int = 1000,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
@ -199,7 +198,7 @@ class Redis(VectorStore):
|
||||
Defaults to None.
|
||||
embeddings (Optional[List[List[float]]], optional): Optional pre-generated
|
||||
embeddings. Defaults to None.
|
||||
keys (Optional[List[str]], optional): Optional key values to use as ids.
|
||||
keys (List[str]) or ids (List[str]): Identifiers of entries.
|
||||
Defaults to None.
|
||||
batch_size (int, optional): Batch size to use for writes. Defaults to 1000.
|
||||
|
||||
@ -209,11 +208,15 @@ class Redis(VectorStore):
|
||||
ids = []
|
||||
prefix = _redis_prefix(self.index_name)
|
||||
|
||||
# Get keys or ids from kwargs
|
||||
# Other vectorstores use ids
|
||||
keys_or_ids = kwargs.get("keys", kwargs.get("ids"))
|
||||
|
||||
# Write data to redis
|
||||
pipeline = self.client.pipeline(transaction=False)
|
||||
for i, text in enumerate(texts):
|
||||
# Use provided values by default or fallback
|
||||
key = keys[i] if keys else _redis_key(prefix)
|
||||
key = keys_or_ids[i] if keys_or_ids else _redis_key(prefix)
|
||||
metadata = metadatas[i] if metadatas else {}
|
||||
embedding = embeddings[i] if embeddings else self.embedding_function(text)
|
||||
pipeline.hset(
|
||||
@ -461,19 +464,23 @@ class Redis(VectorStore):
|
||||
|
||||
@staticmethod
|
||||
def delete(
|
||||
keys: List[str],
|
||||
ids: List[str],
|
||||
**kwargs: Any,
|
||||
) -> bool:
|
||||
"""
|
||||
Delete a Redis entry.
|
||||
|
||||
Args:
|
||||
keys (List[str]): Keys of entries to delete.
|
||||
ids: List of ids (keys) to delete.
|
||||
|
||||
Returns:
|
||||
bool: Whether or not the deletions were successful.
|
||||
"""
|
||||
redis_url = get_from_dict_or_env(kwargs, "redis_url", "REDIS_URL")
|
||||
|
||||
if ids is None:
|
||||
raise ValueError("'ids' (keys)() were not provided.")
|
||||
|
||||
try:
|
||||
import redis
|
||||
except ImportError:
|
||||
@ -491,11 +498,11 @@ class Redis(VectorStore):
|
||||
raise ValueError(f"Your redis connected error: {e}")
|
||||
# Check if index exists
|
||||
try:
|
||||
client.delete(*keys)
|
||||
client.delete(*ids)
|
||||
logger.info("Entries deleted")
|
||||
return True
|
||||
except: # noqa: E722
|
||||
# Keys not exist
|
||||
# ids does not exist
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
|
@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from itertools import repeat
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
@ -70,12 +71,14 @@ class SupabaseVectorStore(VectorStore):
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[dict[Any, Any]]] = None,
|
||||
ids: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
ids = ids or [str(uuid.uuid4()) for _ in texts]
|
||||
docs = self._texts_to_documents(texts, metadatas)
|
||||
|
||||
vectors = self._embedding.embed_documents(list(texts))
|
||||
return self.add_vectors(vectors, docs)
|
||||
return self.add_vectors(vectors, docs, ids)
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
@ -86,6 +89,7 @@ class SupabaseVectorStore(VectorStore):
|
||||
client: Optional[supabase.client.Client] = None,
|
||||
table_name: Optional[str] = "documents",
|
||||
query_name: Union[str, None] = "match_documents",
|
||||
ids: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> "SupabaseVectorStore":
|
||||
"""Return VectorStore initialized from texts and embeddings."""
|
||||
@ -97,8 +101,9 @@ class SupabaseVectorStore(VectorStore):
|
||||
raise ValueError("Supabase document table_name is required.")
|
||||
|
||||
embeddings = embedding.embed_documents(texts)
|
||||
ids = [str(uuid.uuid4()) for _ in texts]
|
||||
docs = cls._texts_to_documents(texts, metadatas)
|
||||
_ids = cls._add_vectors(client, table_name, embeddings, docs)
|
||||
_ids = cls._add_vectors(client, table_name, embeddings, docs, ids)
|
||||
|
||||
return cls(
|
||||
client=client,
|
||||
@ -108,9 +113,12 @@ class SupabaseVectorStore(VectorStore):
|
||||
)
|
||||
|
||||
def add_vectors(
|
||||
self, vectors: List[List[float]], documents: List[Document]
|
||||
self,
|
||||
vectors: List[List[float]],
|
||||
documents: List[Document],
|
||||
ids: List[str],
|
||||
) -> List[str]:
|
||||
return self._add_vectors(self._client, self.table_name, vectors, documents)
|
||||
return self._add_vectors(self._client, self.table_name, vectors, documents, ids)
|
||||
|
||||
def similarity_search(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
@ -200,11 +208,13 @@ class SupabaseVectorStore(VectorStore):
|
||||
table_name: str,
|
||||
vectors: List[List[float]],
|
||||
documents: List[Document],
|
||||
ids: List[str],
|
||||
) -> List[str]:
|
||||
"""Add vectors to Supabase table."""
|
||||
|
||||
rows: List[dict[str, Any]] = [
|
||||
{
|
||||
"id": ids[idx],
|
||||
"content": documents[idx].page_content,
|
||||
"embedding": embedding,
|
||||
"metadata": documents[idx].metadata, # type: ignore
|
||||
@ -219,7 +229,7 @@ class SupabaseVectorStore(VectorStore):
|
||||
for i in range(0, len(rows), chunk_size):
|
||||
chunk = rows[i : i + chunk_size]
|
||||
|
||||
result = client.from_(table_name).insert(chunk).execute() # type: ignore
|
||||
result = client.from_(table_name).upsert(chunk).execute() # type: ignore
|
||||
|
||||
if len(result.data) == 0:
|
||||
raise Exception("Error inserting: No rows added")
|
||||
@ -335,3 +345,20 @@ class SupabaseVectorStore(VectorStore):
|
||||
embedding[0], k, fetch_k, lambda_mult=lambda_mult
|
||||
)
|
||||
return docs
|
||||
|
||||
def delete(self, ids: List[str]) -> None:
|
||||
"""Delete by vector IDs.
|
||||
|
||||
Args:
|
||||
ids: List of ids to delete.
|
||||
"""
|
||||
rows: List[dict[str, Any]] = [
|
||||
{
|
||||
"id": id,
|
||||
}
|
||||
for id in ids
|
||||
]
|
||||
|
||||
# TODO: Check if this can be done in bulk
|
||||
for row in rows:
|
||||
self._client.from_(self.table_name).delete().eq("id", row["id"]).execute()
|
||||
|
@ -135,11 +135,15 @@ class Weaviate(VectorStore):
|
||||
for key, val in metadatas[i].items():
|
||||
data_properties[key] = _json_serializable(val)
|
||||
|
||||
# Allow for ids (consistent w/ other methods)
|
||||
# # Or uuids (backwards compatible w/ existing arg)
|
||||
# If the UUID of one of the objects already exists
|
||||
# then the existing object will be replaced by the new object.
|
||||
_id = (
|
||||
kwargs["uuids"][i] if "uuids" in kwargs else get_valid_uuid(uuid4())
|
||||
)
|
||||
_id = get_valid_uuid(uuid4())
|
||||
if "uuids" in kwargs:
|
||||
_id = kwargs["uuids"][i]
|
||||
elif "ids" in kwargs:
|
||||
_id = kwargs["ids"][i]
|
||||
|
||||
if self._embedding is not None:
|
||||
vector = self._embedding.embed_documents([text])[0]
|
||||
@ -465,3 +469,14 @@ class Weaviate(VectorStore):
|
||||
relevance_score_fn=relevance_score_fn,
|
||||
by_text=by_text,
|
||||
)
|
||||
|
||||
def delete(self, ids: List[str]) -> None:
|
||||
"""Delete by vector IDs.
|
||||
|
||||
Args:
|
||||
ids: List of ids to delete.
|
||||
"""
|
||||
|
||||
# TODO: Check if this can be done in bulk
|
||||
for id in ids:
|
||||
self._client.data_object.delete(uuid=id)
|
||||
|
Loading…
Reference in New Issue
Block a user