mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
index rename delete_mode -> cleanup (#10103)
This commit is contained in:
parent
427f696fb0
commit
71c418725f
@ -34,7 +34,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"When indexing documents into a vector store, it's possible that some existing documents in the vector store should be deleted. In certain situations you may want to remove any existing documents that are derived from the same sources as the new documents being indexed. In others you may want to delete all existing documents wholesale. The indexing API deletion modes let you pick the behavior you want:\n",
|
"When indexing documents into a vector store, it's possible that some existing documents in the vector store should be deleted. In certain situations you may want to remove any existing documents that are derived from the same sources as the new documents being indexed. In others you may want to delete all existing documents wholesale. The indexing API deletion modes let you pick the behavior you want:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"| Delete Mode | De-Duplicates Content | Parallelizable | Cleans Up Deleted Source Docs | Cleans Up Mutations of Source Docs and/or Derived Docs | Clean Up Timing |\n",
|
"| Cleanup Mode | De-Duplicates Content | Parallelizable | Cleans Up Deleted Source Docs | Cleans Up Mutations of Source Docs and/or Derived Docs | Clean Up Timing |\n",
|
||||||
"|-------------|-----------------------|---------------|----------------------------------|----------------------------------------------------|---------------------|\n",
|
"|-------------|-----------------------|---------------|----------------------------------|----------------------------------------------------|---------------------|\n",
|
||||||
"| None | ✅ | ✅ | ❌ | ❌ | - |\n",
|
"| None | ✅ | ✅ | ❌ | ❌ | - |\n",
|
||||||
"| Incremental | ✅ | ✅ | ❌ | ✅ | Continuously |\n",
|
"| Incremental | ✅ | ✅ | ❌ | ✅ | Continuously |\n",
|
||||||
@ -46,7 +46,7 @@
|
|||||||
"`incremental` and `full` offer the following automated clean up:\n",
|
"`incremental` and `full` offer the following automated clean up:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"* If the content of source document or derived documents has **changed**, both `incremental` or `full` modes will clean up (delete) previous versions of the content.\n",
|
"* If the content of source document or derived documents has **changed**, both `incremental` or `full` modes will clean up (delete) previous versions of the content.\n",
|
||||||
"* If the source document has been **deleted** (meaning it is not included in the documents currently being indexed), the `full` delete mode will delete it from the vector store correctly, but the `incremental` mode will not.\n",
|
"* If the source document has been **deleted** (meaning it is not included in the documents currently being indexed), the `full` cleanup mode will delete it from the vector store correctly, but the `incremental` mode will not.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"When content is mutated (e.g., the source PDF file was revised) there will be a period of time during indexing when both the new and old versions may be returned to the user. This happens after the new content was written, but before the old version was deleted.\n",
|
"When content is mutated (e.g., the source PDF file was revised) there will be a period of time during indexing when both the new and old versions may be returned to the user. This happens after the new content was written, but before the old version was deleted.\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -62,7 +62,7 @@
|
|||||||
" \n",
|
" \n",
|
||||||
"## Caution\n",
|
"## Caution\n",
|
||||||
"\n",
|
"\n",
|
||||||
"The record manager relies on a time-based mechanism to determine what content can be cleaned up (when using `full` or `incremental` delete modes).\n",
|
"The record manager relies on a time-based mechanism to determine what content can be cleaned up (when using `full` or `incremental` cleanup modes).\n",
|
||||||
"\n",
|
"\n",
|
||||||
"If two tasks run back to back, and the first task finishes before the clock time changes, then the second task may not be able to clean up content.\n",
|
"If two tasks run back to back, and the first task finishes before the clock time changes, then the second task may not be able to clean up content.\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -197,7 +197,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"def _clear():\n",
|
"def _clear():\n",
|
||||||
" \"\"\"Hacky helper method to clear content. See the `full` mode section to understand why it works.\"\"\"\n",
|
" \"\"\"Hacky helper method to clear content. See the `full` mode section to understand why it works.\"\"\"\n",
|
||||||
" index([], record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
|
" index([], record_manager, vectorstore, cleanup=\"full\", source_id_key=\"source\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -242,7 +242,7 @@
|
|||||||
" [doc1, doc1, doc1, doc1, doc1],\n",
|
" [doc1, doc1, doc1, doc1, doc1],\n",
|
||||||
" record_manager,\n",
|
" record_manager,\n",
|
||||||
" vectorstore,\n",
|
" vectorstore,\n",
|
||||||
" delete_mode=None,\n",
|
" cleanup=None,\n",
|
||||||
" source_id_key=\"source\",\n",
|
" source_id_key=\"source\",\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
@ -276,7 +276,7 @@
|
|||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"index(\n",
|
"index(\n",
|
||||||
" [doc1, doc2], record_manager, vectorstore, delete_mode=None, source_id_key=\"source\"\n",
|
" [doc1, doc2], record_manager, vectorstore, cleanup=None, source_id_key=\"source\"\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -307,7 +307,7 @@
|
|||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"index(\n",
|
"index(\n",
|
||||||
" [doc1, doc2], record_manager, vectorstore, delete_mode=None, source_id_key=\"source\"\n",
|
" [doc1, doc2], record_manager, vectorstore, cleanup=None, source_id_key=\"source\"\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -351,7 +351,7 @@
|
|||||||
" [doc1, doc2],\n",
|
" [doc1, doc2],\n",
|
||||||
" record_manager,\n",
|
" record_manager,\n",
|
||||||
" vectorstore,\n",
|
" vectorstore,\n",
|
||||||
" delete_mode=\"incremental\",\n",
|
" cleanup=\"incremental\",\n",
|
||||||
" source_id_key=\"source\",\n",
|
" source_id_key=\"source\",\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
@ -386,7 +386,7 @@
|
|||||||
" [doc1, doc2],\n",
|
" [doc1, doc2],\n",
|
||||||
" record_manager,\n",
|
" record_manager,\n",
|
||||||
" vectorstore,\n",
|
" vectorstore,\n",
|
||||||
" delete_mode=\"incremental\",\n",
|
" cleanup=\"incremental\",\n",
|
||||||
" source_id_key=\"source\",\n",
|
" source_id_key=\"source\",\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
@ -418,7 +418,7 @@
|
|||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"index(\n",
|
"index(\n",
|
||||||
" [], record_manager, vectorstore, delete_mode=\"incremental\", source_id_key=\"source\"\n",
|
" [], record_manager, vectorstore, cleanup=\"incremental\", source_id_key=\"source\"\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -462,7 +462,7 @@
|
|||||||
" [changed_doc_2],\n",
|
" [changed_doc_2],\n",
|
||||||
" record_manager,\n",
|
" record_manager,\n",
|
||||||
" vectorstore,\n",
|
" vectorstore,\n",
|
||||||
" delete_mode=\"incremental\",\n",
|
" cleanup=\"incremental\",\n",
|
||||||
" source_id_key=\"source\",\n",
|
" source_id_key=\"source\",\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
@ -519,7 +519,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"index(all_docs, record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
|
"index(all_docs, record_manager, vectorstore, cleanup=\"full\", source_id_key=\"source\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -587,7 +587,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"index(all_docs, record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
|
"index(all_docs, record_manager, vectorstore, cleanup=\"full\", source_id_key=\"source\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -693,7 +693,7 @@
|
|||||||
" new_docs,\n",
|
" new_docs,\n",
|
||||||
" record_manager,\n",
|
" record_manager,\n",
|
||||||
" vectorstore,\n",
|
" vectorstore,\n",
|
||||||
" delete_mode=\"incremental\",\n",
|
" cleanup=\"incremental\",\n",
|
||||||
" source_id_key=\"source\",\n",
|
" source_id_key=\"source\",\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
@ -741,7 +741,7 @@
|
|||||||
" changed_doggy_docs,\n",
|
" changed_doggy_docs,\n",
|
||||||
" record_manager,\n",
|
" record_manager,\n",
|
||||||
" vectorstore,\n",
|
" vectorstore,\n",
|
||||||
" delete_mode=\"incremental\",\n",
|
" cleanup=\"incremental\",\n",
|
||||||
" source_id_key=\"source\",\n",
|
" source_id_key=\"source\",\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
@ -866,7 +866,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"index(loader, record_manager, vectorstore, delete_mode=\"full\", source_id_key=\"source\")"
|
"index(loader, record_manager, vectorstore, cleanup=\"full\", source_id_key=\"source\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -169,7 +169,7 @@ def index(
|
|||||||
vector_store: VectorStore,
|
vector_store: VectorStore,
|
||||||
*,
|
*,
|
||||||
batch_size: int = 100,
|
batch_size: int = 100,
|
||||||
delete_mode: Literal["incremental", "full", None] = None,
|
cleanup: Literal["incremental", "full", None] = None,
|
||||||
source_id_key: Union[str, Callable[[Document], str], None] = None,
|
source_id_key: Union[str, Callable[[Document], str], None] = None,
|
||||||
) -> IndexingResult:
|
) -> IndexingResult:
|
||||||
"""Index data from the loader into the vector store.
|
"""Index data from the loader into the vector store.
|
||||||
@ -195,7 +195,7 @@ def index(
|
|||||||
updated.
|
updated.
|
||||||
vector_store: Vector store to index the documents into.
|
vector_store: Vector store to index the documents into.
|
||||||
batch_size: Batch size to use when indexing.
|
batch_size: Batch size to use when indexing.
|
||||||
delete_mode: How to handle clean up of documents.
|
cleanup: How to handle clean up of documents.
|
||||||
- Incremental: Cleans up all documents that haven't been updated AND
|
- Incremental: Cleans up all documents that haven't been updated AND
|
||||||
that are associated with source ids that were seen
|
that are associated with source ids that were seen
|
||||||
during indexing.
|
during indexing.
|
||||||
@ -213,14 +213,14 @@ def index(
|
|||||||
Indexing result which contains information about how many documents
|
Indexing result which contains information about how many documents
|
||||||
were added, updated, deleted, or skipped.
|
were added, updated, deleted, or skipped.
|
||||||
"""
|
"""
|
||||||
if delete_mode not in {"incremental", "full", None}:
|
if cleanup not in {"incremental", "full", None}:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"delete_mode should be one of 'incremental', 'full' or None. "
|
f"cleanup should be one of 'incremental', 'full' or None. "
|
||||||
f"Got {delete_mode}."
|
f"Got {cleanup}."
|
||||||
)
|
)
|
||||||
|
|
||||||
if delete_mode == "incremental" and source_id_key is None:
|
if cleanup == "incremental" and source_id_key is None:
|
||||||
raise ValueError("Source id key is required when delete mode is incremental.")
|
raise ValueError("Source id key is required when cleanup mode is incremental.")
|
||||||
|
|
||||||
# Check that the Vectorstore has required methods implemented
|
# Check that the Vectorstore has required methods implemented
|
||||||
methods = ["delete", "add_documents"]
|
methods = ["delete", "add_documents"]
|
||||||
@ -264,12 +264,12 @@ def index(
|
|||||||
source_id_assigner(doc) for doc in hashed_docs
|
source_id_assigner(doc) for doc in hashed_docs
|
||||||
]
|
]
|
||||||
|
|
||||||
if delete_mode == "incremental":
|
if cleanup == "incremental":
|
||||||
# If the delete mode is incremental, source ids are required.
|
# If the cleanup mode is incremental, source ids are required.
|
||||||
for source_id, hashed_doc in zip(source_ids, hashed_docs):
|
for source_id, hashed_doc in zip(source_ids, hashed_docs):
|
||||||
if source_id is None:
|
if source_id is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Source ids are required when delete mode is incremental. "
|
"Source ids are required when cleanup mode is incremental. "
|
||||||
f"Document that starts with "
|
f"Document that starts with "
|
||||||
f"content: {hashed_doc.page_content[:100]} was not assigned "
|
f"content: {hashed_doc.page_content[:100]} was not assigned "
|
||||||
f"as source id."
|
f"as source id."
|
||||||
@ -307,7 +307,7 @@ def index(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# If source IDs are provided, we can do the deletion incrementally!
|
# If source IDs are provided, we can do the deletion incrementally!
|
||||||
if delete_mode == "incremental":
|
if cleanup == "incremental":
|
||||||
# Get the uids of the documents that were not returned by the loader.
|
# Get the uids of the documents that were not returned by the loader.
|
||||||
|
|
||||||
# mypy isn't good enough to determine that source ids cannot be None
|
# mypy isn't good enough to determine that source ids cannot be None
|
||||||
@ -328,7 +328,7 @@ def index(
|
|||||||
record_manager.delete_keys(uids_to_delete)
|
record_manager.delete_keys(uids_to_delete)
|
||||||
num_deleted += len(uids_to_delete)
|
num_deleted += len(uids_to_delete)
|
||||||
|
|
||||||
if delete_mode == "full":
|
if cleanup == "full":
|
||||||
uids_to_delete = record_manager.list_keys(before=index_start_dt)
|
uids_to_delete = record_manager.list_keys(before=index_start_dt)
|
||||||
|
|
||||||
if uids_to_delete:
|
if uids_to_delete:
|
||||||
|
@ -158,7 +158,7 @@ def test_index_simple_delete_full(
|
|||||||
with patch.object(
|
with patch.object(
|
||||||
record_manager, "get_time", return_value=datetime(2021, 1, 1).timestamp()
|
record_manager, "get_time", return_value=datetime(2021, 1, 1).timestamp()
|
||||||
):
|
):
|
||||||
assert index(loader, record_manager, vector_store, delete_mode="full") == {
|
assert index(loader, record_manager, vector_store, cleanup="full") == {
|
||||||
"num_added": 2,
|
"num_added": 2,
|
||||||
"num_deleted": 0,
|
"num_deleted": 0,
|
||||||
"num_skipped": 0,
|
"num_skipped": 0,
|
||||||
@ -168,7 +168,7 @@ def test_index_simple_delete_full(
|
|||||||
with patch.object(
|
with patch.object(
|
||||||
record_manager, "get_time", return_value=datetime(2021, 1, 1).timestamp()
|
record_manager, "get_time", return_value=datetime(2021, 1, 1).timestamp()
|
||||||
):
|
):
|
||||||
assert index(loader, record_manager, vector_store, delete_mode="full") == {
|
assert index(loader, record_manager, vector_store, cleanup="full") == {
|
||||||
"num_added": 0,
|
"num_added": 0,
|
||||||
"num_deleted": 0,
|
"num_deleted": 0,
|
||||||
"num_skipped": 2,
|
"num_skipped": 2,
|
||||||
@ -189,7 +189,7 @@ def test_index_simple_delete_full(
|
|||||||
with patch.object(
|
with patch.object(
|
||||||
record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
|
record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
|
||||||
):
|
):
|
||||||
assert index(loader, record_manager, vector_store, delete_mode="full") == {
|
assert index(loader, record_manager, vector_store, cleanup="full") == {
|
||||||
"num_added": 1,
|
"num_added": 1,
|
||||||
"num_deleted": 1,
|
"num_deleted": 1,
|
||||||
"num_skipped": 1,
|
"num_skipped": 1,
|
||||||
@ -207,7 +207,7 @@ def test_index_simple_delete_full(
|
|||||||
with patch.object(
|
with patch.object(
|
||||||
record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
|
record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp()
|
||||||
):
|
):
|
||||||
assert index(loader, record_manager, vector_store, delete_mode="full") == {
|
assert index(loader, record_manager, vector_store, cleanup="full") == {
|
||||||
"num_added": 0,
|
"num_added": 0,
|
||||||
"num_deleted": 0,
|
"num_deleted": 0,
|
||||||
"num_skipped": 2,
|
"num_skipped": 2,
|
||||||
@ -238,7 +238,7 @@ def test_incremental_fails_with_bad_source_ids(
|
|||||||
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
# Should raise an error because no source id function was specified
|
# Should raise an error because no source id function was specified
|
||||||
index(loader, record_manager, vector_store, delete_mode="incremental")
|
index(loader, record_manager, vector_store, cleanup="incremental")
|
||||||
|
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
# Should raise an error because no source id function was specified
|
# Should raise an error because no source id function was specified
|
||||||
@ -246,7 +246,7 @@ def test_incremental_fails_with_bad_source_ids(
|
|||||||
loader,
|
loader,
|
||||||
record_manager,
|
record_manager,
|
||||||
vector_store,
|
vector_store,
|
||||||
delete_mode="incremental",
|
cleanup="incremental",
|
||||||
source_id_key="source",
|
source_id_key="source",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -275,7 +275,7 @@ def test_no_delete(
|
|||||||
loader,
|
loader,
|
||||||
record_manager,
|
record_manager,
|
||||||
vector_store,
|
vector_store,
|
||||||
delete_mode=None,
|
cleanup=None,
|
||||||
source_id_key="source",
|
source_id_key="source",
|
||||||
) == {
|
) == {
|
||||||
"num_added": 2,
|
"num_added": 2,
|
||||||
@ -292,7 +292,7 @@ def test_no_delete(
|
|||||||
loader,
|
loader,
|
||||||
record_manager,
|
record_manager,
|
||||||
vector_store,
|
vector_store,
|
||||||
delete_mode=None,
|
cleanup=None,
|
||||||
source_id_key="source",
|
source_id_key="source",
|
||||||
) == {
|
) == {
|
||||||
"num_added": 0,
|
"num_added": 0,
|
||||||
@ -322,7 +322,7 @@ def test_no_delete(
|
|||||||
loader,
|
loader,
|
||||||
record_manager,
|
record_manager,
|
||||||
vector_store,
|
vector_store,
|
||||||
delete_mode=None,
|
cleanup=None,
|
||||||
source_id_key="source",
|
source_id_key="source",
|
||||||
) == {
|
) == {
|
||||||
"num_added": 1,
|
"num_added": 1,
|
||||||
@ -356,7 +356,7 @@ def test_incremental_delete(
|
|||||||
loader,
|
loader,
|
||||||
record_manager,
|
record_manager,
|
||||||
vector_store,
|
vector_store,
|
||||||
delete_mode="incremental",
|
cleanup="incremental",
|
||||||
source_id_key="source",
|
source_id_key="source",
|
||||||
) == {
|
) == {
|
||||||
"num_added": 2,
|
"num_added": 2,
|
||||||
@ -380,7 +380,7 @@ def test_incremental_delete(
|
|||||||
loader,
|
loader,
|
||||||
record_manager,
|
record_manager,
|
||||||
vector_store,
|
vector_store,
|
||||||
delete_mode="incremental",
|
cleanup="incremental",
|
||||||
source_id_key="source",
|
source_id_key="source",
|
||||||
) == {
|
) == {
|
||||||
"num_added": 0,
|
"num_added": 0,
|
||||||
@ -415,7 +415,7 @@ def test_incremental_delete(
|
|||||||
loader,
|
loader,
|
||||||
record_manager,
|
record_manager,
|
||||||
vector_store,
|
vector_store,
|
||||||
delete_mode="incremental",
|
cleanup="incremental",
|
||||||
source_id_key="source",
|
source_id_key="source",
|
||||||
) == {
|
) == {
|
||||||
"num_added": 2,
|
"num_added": 2,
|
||||||
@ -442,7 +442,7 @@ def test_indexing_with_no_docs(
|
|||||||
"""Check edge case when loader returns no new docs."""
|
"""Check edge case when loader returns no new docs."""
|
||||||
loader = ToyLoader(documents=[])
|
loader = ToyLoader(documents=[])
|
||||||
|
|
||||||
assert index(loader, record_manager, vector_store, delete_mode="full") == {
|
assert index(loader, record_manager, vector_store, cleanup="full") == {
|
||||||
"num_added": 0,
|
"num_added": 0,
|
||||||
"num_deleted": 0,
|
"num_deleted": 0,
|
||||||
"num_skipped": 0,
|
"num_skipped": 0,
|
||||||
@ -466,7 +466,7 @@ def test_deduplication(
|
|||||||
]
|
]
|
||||||
|
|
||||||
# Should result in only a single document being added
|
# Should result in only a single document being added
|
||||||
assert index(docs, record_manager, vector_store, delete_mode="full") == {
|
assert index(docs, record_manager, vector_store, cleanup="full") == {
|
||||||
"num_added": 1,
|
"num_added": 1,
|
||||||
"num_deleted": 0,
|
"num_deleted": 0,
|
||||||
"num_skipped": 0,
|
"num_skipped": 0,
|
||||||
|
Loading…
Reference in New Issue
Block a user