Add delete and ensure add_texts performs upsert (w/ ID optional) (#6126)

## Goal 

We want to ensure consistency across vectordbs:
1/ add `delete` by ID method to the base vectorstore class
2/ ensure `add_texts` performs `upsert` with ID optionally passed

## Testing
- [x] Pinecone: notebook test w/ `langchain_test` vectorstore.
- [x] Chroma: Review by @jeffchuber, notebook test w/ in memory
vectorstore.
- [x] Supabase: Review by @copple, notebook test w/ `langchain_test`
table.
- [x] Weaviate: Notebook test w/ `langchain_test` index. 
- [x] Elastic: Reviewed by @vestal. Notebook test w/ `langchain_test`
table.
- [ ] Redis: Asked for review from owner of recent `delete` method
https://github.com/hwchase17/langchain/pull/6222
This commit is contained in:
Lance Martin 2023-06-23 13:03:10 -07:00 committed by GitHub
parent 393f469eb3
commit be02572d58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 115 additions and 20 deletions

View File

@ -48,6 +48,21 @@ class VectorStore(ABC):
List of ids from adding the texts into the vectorstore.
"""
def delete(self, ids: List[str]) -> Optional[bool]:
    """Delete vectors by ID.

    Base-class hook: subclasses override this to remove entries from their
    backing store.

    Args:
        ids: List of ids to delete.

    Returns:
        Optional[bool]: True if deletion is successful,
        False otherwise, None if not implemented.

    Raises:
        NotImplementedError: Always, unless a subclass overrides this method.
    """
    # Fix: the message previously said "delete_by_id", which does not match
    # this method's actual name; keep the error consistent with the API.
    raise NotImplementedError("delete method must be implemented by subclass.")
async def aadd_texts(
self,
texts: Iterable[str],

View File

@ -146,7 +146,7 @@ class Chroma(VectorStore):
embeddings = None
if self._embedding_function is not None:
embeddings = self._embedding_function.embed_documents(list(texts))
self._collection.add(
self._collection.upsert(
metadatas=metadatas, embeddings=embeddings, documents=texts, ids=ids
)
return ids
@ -442,3 +442,11 @@ class Chroma(VectorStore):
client_settings=client_settings,
client=client,
)
def delete(self, ids: List[str]) -> None:
    """Remove the vectors identified by ``ids`` from the collection.

    Args:
        ids: List of ids to delete.
    """
    # Chroma's collection API accepts the id list directly, so simply
    # delegate to the underlying collection.
    self._collection.delete(ids=ids)

View File

@ -158,6 +158,7 @@ class ElasticVectorSearch(VectorStore, ABC):
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
refresh_indices: bool = True,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""Run more texts through the embeddings and add to the vectorstore.
@ -179,7 +180,7 @@ class ElasticVectorSearch(VectorStore, ABC):
"Please install it with `pip install elasticsearch`."
)
requests = []
ids = []
ids = ids or [str(uuid.uuid4()) for _ in texts]
embeddings = self.embedding.embed_documents(list(texts))
dim = len(embeddings[0])
mapping = _default_text_mapping(dim)
@ -194,16 +195,14 @@ class ElasticVectorSearch(VectorStore, ABC):
for i, text in enumerate(texts):
metadata = metadatas[i] if metadatas else {}
_id = str(uuid.uuid4())
request = {
"_op_type": "index",
"_index": self.index_name,
"vector": embeddings[i],
"text": text,
"metadata": metadata,
"_id": _id,
"_id": ids[i],
}
ids.append(_id)
requests.append(request)
bulk(self.client, requests)
@ -318,6 +317,17 @@ class ElasticVectorSearch(VectorStore, ABC):
)
return response
def delete(self, ids: List[str]) -> None:
    """Delete documents from the Elasticsearch index by their vector IDs.

    Args:
        ids: List of ids to delete.
    """
    # TODO: Check if this can be done in bulk (e.g. via the bulk helper with
    # "_op_type": "delete") instead of one request per id.
    for doc_id in ids:  # renamed from `id` to avoid shadowing the builtin
        self.client.delete(index=self.index_name, id=doc_id)
class ElasticKnnSearch(ElasticVectorSearch):
"""

View File

@ -353,3 +353,16 @@ class Pinecone(VectorStore):
return cls(
pinecone.Index(index_name), embedding.embed_query, text_key, namespace
)
def delete(self, ids: List[str]) -> None:
    """Delete by vector IDs.

    Args:
        ids: List of ids to delete.
    """
    # Pinecone limits a single delete request to this many ids, so the
    # deletions are issued in fixed-size batches.
    max_batch = 1000
    for start in range(0, len(ids), max_batch):
        batch = ids[start : start + max_batch]
        self._index.delete(ids=batch)

View File

@ -187,7 +187,6 @@ class Redis(VectorStore):
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
embeddings: Optional[List[List[float]]] = None,
keys: Optional[List[str]] = None,
batch_size: int = 1000,
**kwargs: Any,
) -> List[str]:
@ -199,7 +198,7 @@ class Redis(VectorStore):
Defaults to None.
embeddings (Optional[List[List[float]]], optional): Optional pre-generated
embeddings. Defaults to None.
keys (Optional[List[str]], optional): Optional key values to use as ids.
keys (List[str]) or ids (List[str]): Identifiers of entries.
Defaults to None.
batch_size (int, optional): Batch size to use for writes. Defaults to 1000.
@ -209,11 +208,15 @@ class Redis(VectorStore):
ids = []
prefix = _redis_prefix(self.index_name)
# Get keys or ids from kwargs
# Other vectorstores use ids
keys_or_ids = kwargs.get("keys", kwargs.get("ids"))
# Write data to redis
pipeline = self.client.pipeline(transaction=False)
for i, text in enumerate(texts):
# Use provided values by default or fallback
key = keys[i] if keys else _redis_key(prefix)
key = keys_or_ids[i] if keys_or_ids else _redis_key(prefix)
metadata = metadatas[i] if metadatas else {}
embedding = embeddings[i] if embeddings else self.embedding_function(text)
pipeline.hset(
@ -461,19 +464,23 @@ class Redis(VectorStore):
@staticmethod
def delete(
keys: List[str],
ids: List[str],
**kwargs: Any,
) -> bool:
"""
Delete a Redis entry.
Args:
keys (List[str]): Keys of entries to delete.
ids: List of ids (keys) to delete.
Returns:
bool: Whether or not the deletions were successful.
"""
redis_url = get_from_dict_or_env(kwargs, "redis_url", "REDIS_URL")
if ids is None:
raise ValueError("'ids' (keys)() were not provided.")
try:
import redis
except ImportError:
@ -491,11 +498,11 @@ class Redis(VectorStore):
raise ValueError(f"Your redis connected error: {e}")
# Check if index exists
try:
client.delete(*keys)
client.delete(*ids)
logger.info("Entries deleted")
return True
except: # noqa: E722
# Keys not exist
# ids does not exist
return False
@staticmethod

View File

@ -1,5 +1,6 @@
from __future__ import annotations
import uuid
from itertools import repeat
from typing import (
TYPE_CHECKING,
@ -70,12 +71,14 @@ class SupabaseVectorStore(VectorStore):
self,
texts: Iterable[str],
metadatas: Optional[List[dict[Any, Any]]] = None,
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
ids = ids or [str(uuid.uuid4()) for _ in texts]
docs = self._texts_to_documents(texts, metadatas)
vectors = self._embedding.embed_documents(list(texts))
return self.add_vectors(vectors, docs)
return self.add_vectors(vectors, docs, ids)
@classmethod
def from_texts(
@ -86,6 +89,7 @@ class SupabaseVectorStore(VectorStore):
client: Optional[supabase.client.Client] = None,
table_name: Optional[str] = "documents",
query_name: Union[str, None] = "match_documents",
ids: Optional[List[str]] = None,
**kwargs: Any,
) -> "SupabaseVectorStore":
"""Return VectorStore initialized from texts and embeddings."""
@ -97,8 +101,9 @@ class SupabaseVectorStore(VectorStore):
raise ValueError("Supabase document table_name is required.")
embeddings = embedding.embed_documents(texts)
ids = [str(uuid.uuid4()) for _ in texts]
docs = cls._texts_to_documents(texts, metadatas)
_ids = cls._add_vectors(client, table_name, embeddings, docs)
_ids = cls._add_vectors(client, table_name, embeddings, docs, ids)
return cls(
client=client,
@ -108,9 +113,12 @@ class SupabaseVectorStore(VectorStore):
)
def add_vectors(
self, vectors: List[List[float]], documents: List[Document]
self,
vectors: List[List[float]],
documents: List[Document],
ids: List[str],
) -> List[str]:
return self._add_vectors(self._client, self.table_name, vectors, documents)
return self._add_vectors(self._client, self.table_name, vectors, documents, ids)
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
@ -200,11 +208,13 @@ class SupabaseVectorStore(VectorStore):
table_name: str,
vectors: List[List[float]],
documents: List[Document],
ids: List[str],
) -> List[str]:
"""Add vectors to Supabase table."""
rows: List[dict[str, Any]] = [
{
"id": ids[idx],
"content": documents[idx].page_content,
"embedding": embedding,
"metadata": documents[idx].metadata, # type: ignore
@ -219,7 +229,7 @@ class SupabaseVectorStore(VectorStore):
for i in range(0, len(rows), chunk_size):
chunk = rows[i : i + chunk_size]
result = client.from_(table_name).insert(chunk).execute() # type: ignore
result = client.from_(table_name).upsert(chunk).execute() # type: ignore
if len(result.data) == 0:
raise Exception("Error inserting: No rows added")
@ -335,3 +345,20 @@ class SupabaseVectorStore(VectorStore):
embedding[0], k, fetch_k, lambda_mult=lambda_mult
)
return docs
def delete(self, ids: List[str]) -> None:
    """Delete rows from the Supabase table by vector ID.

    Args:
        ids: List of ids to delete.
    """
    # Iterate the ids directly: the previous implementation built a
    # throwaway list of {"id": id} dicts only to read the id back out,
    # and shadowed the `id` builtin in the process. The sequence of
    # client calls is unchanged.
    # TODO: Check if this can be done in bulk (e.g. a single filtered
    # delete) instead of one round-trip per id.
    for row_id in ids:
        self._client.from_(self.table_name).delete().eq("id", row_id).execute()

View File

@ -135,11 +135,15 @@ class Weaviate(VectorStore):
for key, val in metadatas[i].items():
data_properties[key] = _json_serializable(val)
# Allow for ids (consistent w/ other methods)
# Or uuids (backwards compatible w/ existing arg)
# If the UUID of one of the objects already exists
# then the existing object will be replaced by the new object.
_id = (
kwargs["uuids"][i] if "uuids" in kwargs else get_valid_uuid(uuid4())
)
_id = get_valid_uuid(uuid4())
if "uuids" in kwargs:
_id = kwargs["uuids"][i]
elif "ids" in kwargs:
_id = kwargs["ids"][i]
if self._embedding is not None:
vector = self._embedding.embed_documents([text])[0]
@ -465,3 +469,14 @@ class Weaviate(VectorStore):
relevance_score_fn=relevance_score_fn,
by_text=by_text,
)
def delete(self, ids: List[str]) -> None:
    """Delete objects from Weaviate by their vector IDs (UUIDs).

    Args:
        ids: List of ids to delete.
    """
    # TODO: Check if this can be done in bulk via Weaviate's batch API.
    for object_id in ids:  # renamed from `id` to avoid shadowing the builtin
        self._client.data_object.delete(uuid=object_id)