Feature faiss delete (#8135)

<!-- Thank you for contributing to LangChain!

Replace this comment with:
- Description: docstore had two main methods, add and search; however,
working with a docstore sometimes requires deleting an entry from
it. So I have added a simple delete method that deletes items from the
docstore. Additionally, I have added a delete method to the faiss
vectorstore for the very same reason.
  - Issue: NA
  - Dependencies: NA
  - Tag maintainer:  @rlancemartin, @eyurtsev
- Twitter handle: we announce bigger features on Twitter. If your PR
gets announced and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` to check this
locally.

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
  2. an example notebook showing its use.

Maintainer responsibilities:
  - General / Misc / if you don't know who to tag: @baskaryan
  - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev
  - Models / Prompts: @hwchase17, @baskaryan
  - Memory: @hwchase17
  - Agents / Tools / Toolkits: @hinthornw
  - Tracing / Callbacks: @agola11
  - Async: @agola11

If no one reviews your PR within a few days, feel free to @-mention the
same people again.

See contribution guidelines for more information on how to write/run
tests, lint, etc:
https://github.com/hwchase17/langchain/blob/master/.github/CONTRIBUTING.md
 -->

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
fqassemi 2023-08-06 15:46:30 -07:00 committed by GitHub
parent b57fa1a39c
commit 485d716c21
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 107 additions and 4 deletions

View File

@ -80,7 +80,7 @@
"source": [ "source": [
"from langchain.document_loaders import TextLoader\n", "from langchain.document_loaders import TextLoader\n",
"\n", "\n",
"loader = TextLoader(\"../../../state_of_the_union.txt\")\n", "loader = TextLoader(\"../../../extras/modules/state_of_the_union.txt\")\n",
"documents = loader.load()\n", "documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n", "docs = text_splitter.split_documents(documents)\n",
@ -90,7 +90,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 3,
"id": "5eabdb75", "id": "5eabdb75",
"metadata": { "metadata": {
"tags": [] "tags": []
@ -517,6 +517,67 @@
"for doc in results:\n", "for doc in results:\n",
" print(f\"Content: {doc.page_content}, Metadata: {doc.metadata}\")" " print(f\"Content: {doc.page_content}, Metadata: {doc.metadata}\")"
] ]
},
{
"cell_type": "markdown",
"id": "1becca53",
"metadata": {},
"source": [
"## Delete\n",
"\n",
"You can also delete ids. Note that the ids to delete should be the ids in the docstore."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1408b870",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"db.delete([db.index_to_docstore_id[0]])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d13daf33",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Is now missing\n",
"0 in db.index_to_docstore_id"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30ace43e",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {

View File

@ -1,6 +1,6 @@
"""Interface to access to place that stores documents.""" """Interface to access to place that stores documents."""
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Dict, Union from typing import Dict, List, Union
from langchain.docstore.document import Document from langchain.docstore.document import Document
@ -16,6 +16,10 @@ class Docstore(ABC):
If page does not exist, return similar entries. If page does not exist, return similar entries.
""" """
def delete(self, ids: List) -> None:
    """Delete the given ids from the docstore.

    This default implementation deletes nothing and always raises, so a
    docstore that supports deletion has to override it.

    Args:
        ids: Identifiers of the entries to remove.

    Raises:
        NotImplementedError: Always.
    """
    # NotImplementedError is kept as the exception type; only a message is
    # added so the failure names the class that lacks deletion support.
    raise NotImplementedError(f"{type(self).__name__} does not support delete.")
class AddableMixin(ABC): class AddableMixin(ABC):
"""Mixin class that supports adding texts.""" """Mixin class that supports adding texts."""

View File

@ -1,5 +1,5 @@
"""Simple in memory docstore in the form of a dict.""" """Simple in memory docstore in the form of a dict."""
from typing import Dict, Optional, Union from typing import Dict, List, Optional, Union
from langchain.docstore.base import AddableMixin, Docstore from langchain.docstore.base import AddableMixin, Docstore
from langchain.docstore.document import Document from langchain.docstore.document import Document
@ -26,6 +26,14 @@ class InMemoryDocstore(Docstore, AddableMixin):
raise ValueError(f"Tried to add ids that already exist: {overlapping}") raise ValueError(f"Tried to add ids that already exist: {overlapping}")
self._dict = {**self._dict, **texts} self._dict = {**self._dict, **texts}
def delete(self, ids: List) -> None:
    """Delete entries from the in-memory dictionary by id.

    All ids are validated before anything is removed, so a failed call
    leaves the store untouched. An id that appears more than once in
    ``ids`` is removed once.

    Args:
        ids: Keys of ``self._dict`` to remove.

    Raises:
        ValueError: If ``ids`` is empty or contains any key that is not
            present in the store.
    """
    # Reject the whole request when any id is unknown. Checking only for
    # "no id is known" would let a mixed request delete some entries and
    # then fail with KeyError part-way through.
    missing = set(ids).difference(self._dict)
    if not ids or missing:
        raise ValueError(f"Tried to delete ids that does not exist: {ids}")
    # dict.fromkeys de-duplicates while keeping the caller's order, so a
    # repeated id is not popped twice.
    for _id in dict.fromkeys(ids):
        self._dict.pop(_id)
def search(self, search: str) -> Union[str, Document]: def search(self, search: str) -> Union[str, Document]:
"""Search via direct lookup. """Search via direct lookup.

View File

@ -468,6 +468,36 @@ class FAISS(VectorStore):
) )
return docs return docs
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
    """Delete by ID. These are the IDs in the docstore.

    All ids are validated before anything is modified, so a failed call
    leaves the index, the id mapping and the docstore untouched.

    Args:
        ids: List of docstore ids to delete, i.e. values of
            ``self.index_to_docstore_id``.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        Optional[bool]: True once the ids have been removed.

    Raises:
        ValueError: If ``ids`` is None, is empty, or contains any id that
            is not a value of ``self.index_to_docstore_id``.
    """
    if ids is None:
        raise ValueError("No ids provided to delete.")

    # Every requested id must be known. Checking only that *some* id is
    # known would let the reverse lookup below fail with KeyError.
    missing = set(ids).difference(self.index_to_docstore_id.values())
    if not ids or missing:
        raise ValueError("ids do not exist in the current object")

    position_of = {
        doc_id: position for position, doc_id in self.index_to_docstore_id.items()
    }
    # A set, so an id listed twice is only removed once.
    positions_to_delete = {position_of[doc_id] for doc_id in ids}

    # Removing ids from index.
    self.index.remove_ids(np.array(sorted(positions_to_delete), dtype=np.int64))

    # NOTE(review): this assumes the index numbers its vectors sequentially
    # and shifts later vectors down when earlier ones are removed (as FAISS
    # flat indexes do) -- confirm before using with an ID-mapped index.
    # The mapping is renumbered so that position i keeps pointing at the
    # document whose vector now sits at position i.
    survivors = [
        doc_id
        for position, doc_id in sorted(self.index_to_docstore_id.items())
        if position not in positions_to_delete
    ]
    # Mutate in place so existing references to the mapping stay valid.
    self.index_to_docstore_id.clear()
    self.index_to_docstore_id.update(enumerate(survivors))

    # Remove items from docstore (de-duplicated, caller order preserved).
    self.docstore.delete(list(dict.fromkeys(ids)))
    return True
def merge_from(self, target: FAISS) -> None: def merge_from(self, target: FAISS) -> None:
"""Merge another FAISS object with the current one. """Merge another FAISS object with the current one.