diff --git a/libs/core/langchain_core/indexing/api.py b/libs/core/langchain_core/indexing/api.py index 01bc9bffb2..86741ae3a2 100644 --- a/libs/core/langchain_core/indexing/api.py +++ b/libs/core/langchain_core/indexing/api.py @@ -214,10 +214,18 @@ def index( are not able to specify the uid of the document. IMPORTANT: - if auto_cleanup is set to True, the loader should be returning - the entire dataset, and not just a subset of the dataset. - Otherwise, the auto_cleanup will remove documents that it is not - supposed to. + * If auto_cleanup is set to True, the loader should be returning + the entire dataset, and not just a subset of the dataset. + Otherwise, the auto_cleanup will remove documents that it is not + supposed to. + * In incremental mode, if documents associated with a particular + source id appear across different batches, the indexing API + will do some redundant work. This will still result in the + correct end state of the index, but will unfortunately not be + 100% efficient. For example, if a given document is split into 15 + chunks, and we index them using a batch size of 5, we'll have 3 batches + all with the same source id. In general, to avoid doing too much + redundant work, select as big a batch size as possible. Args: docs_source: Data loader or iterable of documents to index. diff --git a/libs/core/langchain_core/indexing/base.py b/libs/core/langchain_core/indexing/base.py index 776f1f1089..ac73191bbe 100644 --- a/libs/core/langchain_core/indexing/base.py +++ b/libs/core/langchain_core/indexing/base.py @@ -5,7 +5,39 @@ from typing import List, Optional, Sequence class RecordManager(ABC): - """Abstract base class representing the interface for a record manager.""" + """Abstract base class representing the interface for a record manager. + + The record manager abstraction is used by the LangChain indexing API. + + The record manager keeps track of which documents have been + written into a vectorstore and when they were written.
+ + The indexing API computes a hash for each document and stores the hash + together with the write time and the source id in the record manager. + + On subsequent indexing runs, the indexing API can check the record manager + to determine which documents have already been indexed and which have not. + + This allows the indexing API to avoid re-indexing documents that have + already been indexed, and to only index new documents. + + The main benefit of this abstraction is that it works across many vectorstores. + To be supported, a vectorstore only needs to support the ability to add and + delete documents by ID. Using the record manager, the indexing API will + be able to delete outdated documents and avoid redundant indexing of documents + that have already been indexed. + + The main constraints of this abstraction are: + + 1. It relies on timestamps to determine which documents have been + indexed and which have not. This means that the timestamps must be + monotonically increasing. The timestamp should be the timestamp + as measured by the server to minimize issues. + 2. The record manager is currently implemented separately from the + vectorstore, which means that the overall system becomes distributed + and may create issues with consistency. For example, a write to the + record manager may succeed while the corresponding write to the vectorstore fails. + """ def __init__( self,