From 485d716c21d8e40c13d4b1a53b6dd6beb8c37ae4 Mon Sep 17 00:00:00 2001 From: fqassemi Date: Sun, 6 Aug 2023 15:46:30 -0700 Subject: [PATCH] Feature faiss delete (#8135) --------- Co-authored-by: Harrison Chase --- .../integrations/vectorstores/faiss.ipynb | 65 ++++++++++++++++++- libs/langchain/langchain/docstore/base.py | 6 +- .../langchain/langchain/docstore/in_memory.py | 10 ++- .../langchain/langchain/vectorstores/faiss.py | 30 +++++++++ 4 files changed, 107 insertions(+), 4 deletions(-) diff --git a/docs/extras/integrations/vectorstores/faiss.ipynb b/docs/extras/integrations/vectorstores/faiss.ipynb index 7fb3ff7e56..7a355c3732 100644 --- a/docs/extras/integrations/vectorstores/faiss.ipynb +++ b/docs/extras/integrations/vectorstores/faiss.ipynb @@ -80,7 +80,7 @@ "source": [ "from langchain.document_loaders import TextLoader\n", "\n", - "loader = TextLoader(\"../../../state_of_the_union.txt\")\n", + "loader = TextLoader(\"../../../extras/modules/state_of_the_union.txt\")\n", "documents = loader.load()\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "docs = text_splitter.split_documents(documents)\n", @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "id": "5eabdb75", "metadata": { "tags": [] @@ -517,6 +517,67 @@ "for doc in results:\n", " print(f\"Content: {doc.page_content}, Metadata: {doc.metadata}\")" ] + }, + { + "cell_type": "markdown", + "id": "1becca53", + "metadata": {}, + "source": [ + "## Delete\n", + "\n", + "You can also delete ids. Note that the ids to delete should be the ids in the docstore." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1408b870", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.delete([db.index_to_docstore_id[0]])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d13daf33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Is now missing\n", + "0 in db.index_to_docstore_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30ace43e", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/libs/langchain/langchain/docstore/base.py b/libs/langchain/langchain/docstore/base.py index 4a91680c73..b4f366fbaa 100644 --- a/libs/langchain/langchain/docstore/base.py +++ b/libs/langchain/langchain/docstore/base.py @@ -1,6 +1,6 @@ """Interface to access to place that stores documents.""" from abc import ABC, abstractmethod -from typing import Dict, Union +from typing import Dict, List, Union from langchain.docstore.document import Document @@ -16,6 +16,10 @@ class Docstore(ABC): If page does not exist, return similar entries. """ + def delete(self, ids: List) -> None: + """Deleting IDs from in memory dictionary.""" + raise NotImplementedError + class AddableMixin(ABC): """Mixin class that supports adding texts.""" diff --git a/libs/langchain/langchain/docstore/in_memory.py b/libs/langchain/langchain/docstore/in_memory.py index 96b8a52ffa..6832483dd2 100644 --- a/libs/langchain/langchain/docstore/in_memory.py +++ b/libs/langchain/langchain/docstore/in_memory.py @@ -1,5 +1,5 @@ """Simple in memory docstore in the form of a dict.""" -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union from langchain.docstore.base import AddableMixin, Docstore from langchain.docstore.document import Document @@ -26,6 +26,14 @@ class InMemoryDocstore(Docstore, AddableMixin): raise ValueError(f"Tried to add ids that already exist: {overlapping}") self._dict = {**self._dict, **texts} + def delete(self, ids: List) -> None: + """Deleting IDs from in memory dictionary.""" + overlapping = set(ids).intersection(self._dict) + if not overlapping: + raise ValueError(f"Tried to delete ids that does not exist: {ids}") + for _id in ids: + self._dict.pop(_id) + def search(self, search: str) -> Union[str, Document]: """Search via direct lookup. diff --git a/libs/langchain/langchain/vectorstores/faiss.py b/libs/langchain/langchain/vectorstores/faiss.py index a5f1ba4b61..898a96f155 100644 --- a/libs/langchain/langchain/vectorstores/faiss.py +++ b/libs/langchain/langchain/vectorstores/faiss.py @@ -468,6 +468,36 @@ class FAISS(VectorStore): ) return docs + def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]: + """Delete by ID. These are the IDs in the vectorstore. + + Args: + ids: List of ids to delete. + + Returns: + Optional[bool]: True if deletion is successful, + False otherwise, None if not implemented. + """ + if ids is None: + raise ValueError("No ids provided to delete.") + + overlapping = set(ids).intersection(self.index_to_docstore_id.values()) + if not overlapping: + raise ValueError("ids do not exist in the current object") + + _reversed_index = {v: k for k, v in self.index_to_docstore_id.items()} + + index_to_delete = [_reversed_index[i] for i in ids] + + # Removing ids from index. + self.index.remove_ids(np.array(index_to_delete, dtype=np.int64)) + for _id in index_to_delete: + del self.index_to_docstore_id[_id] + + # Remove items from docstore. + self.docstore.delete(ids) + return True + def merge_from(self, target: FAISS) -> None: """Merge another FAISS object with the current one.