index rename delete_mode -> cleanup (#10103)

pull/10109/head
Bagatur 1 year ago committed by GitHub
parent 427f696fb0
commit 71c418725f

@ -34,7 +34,7 @@
"\n",
"When indexing documents into a vector store, it's possible that some existing documents in the vector store should be deleted. In certain situations you may want to remove any existing documents that are derived from the same sources as the new documents being indexed. In others you may want to delete all existing documents wholesale. The indexing API deletion modes let you pick the behavior you want:\n",
"\n",
"| Delete Mode | De-Duplicates Content | Parallelizable | Cleans Up Deleted Source Docs | Cleans Up Mutations of Source Docs and/or Derived Docs | Clean Up Timing |\n",
"| Cleanup Mode | De-Duplicates Content | Parallelizable | Cleans Up Deleted Source Docs | Cleans Up Mutations of Source Docs and/or Derived Docs | Clean Up Timing |\n",
"|-------------|-----------------------|---------------|----------------------------------|----------------------------------------------------|---------------------|\n",
"| None | ✅ | ✅ | ❌ | ❌ | - |\n",
"| Incremental | ✅ | ✅ | ❌ | ✅ | Continuously |\n",
@ -46,7 +46,7 @@
"`incremental` and `full` offer the following automated clean up:\n",
"\n",
"* If the content of source document or derived documents has **changed**, both `incremental` or `full` modes will clean up (delete) previous versions of the content.\n",
"* If the source document has been **deleted** (meaning it is not included in the documents currently being indexed), the `full` delete mode will delete it from the vector store correctly, but the `incremental` mode will not.\n",
"* If the source document has been **deleted** (meaning it is not included in the documents currently being indexed), the `full` cleanup mode will delete it from the vector store correctly, but the `incremental` mode will not.\n",
"\n",
"When content is mutated (e.g., the source PDF file was revised) there will be a period of time during indexing when both the new and old versions may be returned to the user. This happens after the new content was written, but before the old version was deleted.\n",
"\n",
@ -62,7 +62,7 @@
" \n",
"## Caution\n",
"\n",
"The record manager relies on a time-based mechanism to determine what content can be cleaned up (when using `full` or `incremental` delete modes).\n",
"The record manager relies on a time-based mechanism to determine what content can be cleaned up (when using `full` or `incremental` cleanup modes).\n",
"\n",
"If two tasks run back to back, and the first task finishes before the the clock time changes, then the second task may not be able to clean up content.\n",
"\n",
@ -197,7 +197,7 @@
"source": [
"def _clear():\n",
" \"\"\"Hacky helper method to clear content. See the `full` mode section to to understand why it works.\"\"\"\n",
" index([], record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
" index([], record_manager, vectorstore, cleanup=\"full\", source_id_key=\"source\")"
]
},
{
@ -242,7 +242,7 @@
" [doc1, doc1, doc1, doc1, doc1],\n",
" record_manager,\n",
" vectorstore,\n",
" delete_mode=None,\n",
" cleanup=None,\n",
" source_id_key=\"source\",\n",
")"
]
@ -276,7 +276,7 @@
],
"source": [
"index(\n",
" [doc1, doc2], record_manager, vectorstore, delete_mode=None, source_id_key=\"source\"\n",
" [doc1, doc2], record_manager, vectorstore, cleanup=None, source_id_key=\"source\"\n",
")"
]
},
@ -307,7 +307,7 @@
],
"source": [
"index(\n",
" [doc1, doc2], record_manager, vectorstore, delete_mode=None, source_id_key=\"source\"\n",
" [doc1, doc2], record_manager, vectorstore, cleanup=None, source_id_key=\"source\"\n",
")"
]
},
@ -351,7 +351,7 @@
" [doc1, doc2],\n",
" record_manager,\n",
" vectorstore,\n",
" delete_mode=\"incremental\",\n",
" cleanup=\"incremental\",\n",
" source_id_key=\"source\",\n",
")"
]
@ -386,7 +386,7 @@
" [doc1, doc2],\n",
" record_manager,\n",
" vectorstore,\n",
" delete_mode=\"incremental\",\n",
" cleanup=\"incremental\",\n",
" source_id_key=\"source\",\n",
")"
]
@ -418,7 +418,7 @@
],
"source": [
"index(\n",
" [], record_manager, vectorstore, delete_mode=\"incremental\", source_id_key=\"source\"\n",
" [], record_manager, vectorstore, cleanup=\"incremental\", source_id_key=\"source\"\n",
")"
]
},
@ -462,7 +462,7 @@
" [changed_doc_2],\n",
" record_manager,\n",
" vectorstore,\n",
" delete_mode=\"incremental\",\n",
" cleanup=\"incremental\",\n",
" source_id_key=\"source\",\n",
")"
]
@ -519,7 +519,7 @@
}
],
"source": [
"index(all_docs, record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
"index(all_docs, record_manager, vectorstore, cleanup=\"full\", source_id_key=\"source\")"
]
},
{
@ -587,7 +587,7 @@
}
],
"source": [
"index(all_docs, record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
"index(all_docs, record_manager, vectorstore, cleanup=\"full\", source_id_key=\"source\")"
]
},
{
@ -693,7 +693,7 @@
" new_docs,\n",
" record_manager,\n",
" vectorstore,\n",
" delete_mode=\"incremental\",\n",
" cleanup=\"incremental\",\n",
" source_id_key=\"source\",\n",
")"
]
@ -741,7 +741,7 @@
" changed_doggy_docs,\n",
" record_manager,\n",
" vectorstore,\n",
" delete_mode=\"incremental\",\n",
" cleanup=\"incremental\",\n",
" source_id_key=\"source\",\n",
")"
]
@ -866,7 +866,7 @@
}
],
"source": [
"index(loader, record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
"index(loader, record_manager, vectorstore, cleanup=\"full\", source_id_key=\"source\")"
]
},
{

@ -169,7 +169,7 @@ def index(
vector_store: VectorStore,
*,
batch_size: int = 100,
delete_mode: Literal["incremental", "full", None] = None,
cleanup: Literal["incremental", "full", None] = None,
source_id_key: Union[str, Callable[[Document], str], None] = None,
) -> IndexingResult:
"""Index data from the loader into the vector store.
@ -195,7 +195,7 @@ def index(
updated.
vector_store: Vector store to index the documents into.
batch_size: Batch size to use when indexing.
delete_mode: How to handle clean up of documents.
cleanup: How to handle clean up of documents.
- Incremental: Cleans up all documents that haven't been updated AND
that are associated with source ids that were seen
during indexing.
@ -213,14 +213,14 @@ def index(
Indexing result which contains information about how many documents
were added, updated, deleted, or skipped.
"""
if delete_mode not in {"incremental", "full", None}:
if cleanup not in {"incremental", "full", None}:
raise ValueError(
f"delete_mode should be one of 'incremental', 'full' or None. "
f"Got {delete_mode}."
f"cleanup should be one of 'incremental', 'full' or None. "
f"Got {cleanup}."
)
if delete_mode == "incremental" and source_id_key is None:
raise ValueError("Source id key is required when delete mode is incremental.")
if cleanup == "incremental" and source_id_key is None:
raise ValueError("Source id key is required when cleanup mode is incremental.")
# Check that the Vectorstore has required methods implemented
methods = ["delete", "add_documents"]
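A hedged illustration of what the validation above rejects; `docs`, `record_manager`, and `vectorstore` are hypothetical stand-ins for real objects:

```python
# Rejected: unknown cleanup value.
index(docs, record_manager, vectorstore, cleanup="partial")
# ValueError: cleanup should be one of 'incremental', 'full' or None. Got partial.

# Rejected: incremental cleanup without a source_id_key to derive source ids.
index(docs, record_manager, vectorstore, cleanup="incremental")
# ValueError: Source id key is required when cleanup mode is incremental.
```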
@ -264,12 +264,12 @@ def index(
source_id_assigner(doc) for doc in hashed_docs
]
if delete_mode == "incremental":
# If the delete mode is incremental, source ids are required.
if cleanup == "incremental":
# If the cleanup mode is incremental, source ids are required.
for source_id, hashed_doc in zip(source_ids, hashed_docs):
if source_id is None:
raise ValueError(
"Source ids are required when delete mode is incremental. "
"Source ids are required when cleanup mode is incremental. "
f"Document that starts with "
f"content: {hashed_doc.page_content[:100]} was not assigned "
f"as source id."
@ -307,7 +307,7 @@ def index(
)
# If source IDs are provided, we can do the deletion incrementally!
if delete_mode == "incremental":
if cleanup == "incremental":
# Get the uids of the documents that were not returned by the loader.
# mypy isn't good enough to determine that source ids cannot be None
@ -328,7 +328,7 @@ def index(
record_manager.delete_keys(uids_to_delete)
num_deleted += len(uids_to_delete)
if delete_mode == "full":
if cleanup == "full":
uids_to_delete = record_manager.list_keys(before=index_start_dt)
if uids_to_delete:
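This `full`-mode branch is also why the notebook's `_clear()` helper works: every key recorded before the current run started is deleted, so indexing an empty batch clears the store:

```python
# Under full cleanup, indexing nothing deletes everything previously indexed.
index([], record_manager, vectorstore, cleanup="full", source_id_key="source")
```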

@ -158,7 +158,7 @@ def test_index_simple_delete_full(
with patch.object(
record_manager, "get_time", return_value=datetime(2021, 1, 1).timestamp()
):
assert index(loader, record_manager, vector_store, delete_mode="full") == {
assert index(loader, record_manager, vector_store, cleanup="full") == {
"num_added": 2,
"num_deleted": 0,
"num_skipped": 0,
@ -168,7 +168,7 @@ def test_index_simple_delete_full(
with patch.object(
record_manager, "get_time", return_value=datetime(2021, 1, 1).timestamp()
):
assert index(loader, record_manager, vector_store, delete_mode="full") == {
assert index(loader, record_manager, vector_store, cleanup="full") == {
"num_added": 0,
"num_deleted": 0,
"num_skipped": 2,
@ -189,7 +189,7 @@ def test_index_simple_delete_full(
with patch.object(
record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
):
assert index(loader, record_manager, vector_store, delete_mode="full") == {
assert index(loader, record_manager, vector_store, cleanup="full") == {
"num_added": 1,
"num_deleted": 1,
"num_skipped": 1,
@ -207,7 +207,7 @@ def test_index_simple_delete_full(
with patch.object(
record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
):
assert index(loader, record_manager, vector_store, delete_mode="full") == {
assert index(loader, record_manager, vector_store, cleanup="full") == {
"num_added": 0,
"num_deleted": 0,
"num_skipped": 2,
@ -238,7 +238,7 @@ def test_incremental_fails_with_bad_source_ids(
with pytest.raises(ValueError):
# Should raise an error because no source id function was specified
index(loader, record_manager, vector_store, delete_mode="incremental")
index(loader, record_manager, vector_store, cleanup="incremental")
with pytest.raises(ValueError):
# Should raise an error because no source id function was specified
@ -246,7 +246,7 @@ def test_incremental_fails_with_bad_source_ids(
loader,
record_manager,
vector_store,
delete_mode="incremental",
cleanup="incremental",
source_id_key="source",
)
@ -275,7 +275,7 @@ def test_no_delete(
loader,
record_manager,
vector_store,
delete_mode=None,
cleanup=None,
source_id_key="source",
) == {
"num_added": 2,
@ -292,7 +292,7 @@ def test_no_delete(
loader,
record_manager,
vector_store,
delete_mode=None,
cleanup=None,
source_id_key="source",
) == {
"num_added": 0,
@ -322,7 +322,7 @@ def test_no_delete(
loader,
record_manager,
vector_store,
delete_mode=None,
cleanup=None,
source_id_key="source",
) == {
"num_added": 1,
@ -356,7 +356,7 @@ def test_incremental_delete(
loader,
record_manager,
vector_store,
delete_mode="incremental",
cleanup="incremental",
source_id_key="source",
) == {
"num_added": 2,
@ -380,7 +380,7 @@ def test_incremental_delete(
loader,
record_manager,
vector_store,
delete_mode="incremental",
cleanup="incremental",
source_id_key="source",
) == {
"num_added": 0,
@ -415,7 +415,7 @@ def test_incremental_delete(
loader,
record_manager,
vector_store,
delete_mode="incremental",
cleanup="incremental",
source_id_key="source",
) == {
"num_added": 2,
@ -442,7 +442,7 @@ def test_indexing_with_no_docs(
"""Check edge case when loader returns no new docs."""
loader = ToyLoader(documents=[])
assert index(loader, record_manager, vector_store, delete_mode="full") == {
assert index(loader, record_manager, vector_store, cleanup="full") == {
"num_added": 0,
"num_deleted": 0,
"num_skipped": 0,
@ -466,7 +466,7 @@ def test_deduplication(
]
# Should result in only a single document being added
assert index(docs, record_manager, vector_store, delete_mode="full") == {
assert index(docs, record_manager, vector_store, cleanup="full") == {
"num_added": 1,
"num_deleted": 0,
"num_skipped": 0,
