|
|
|
@ -5,7 +5,39 @@ from typing import List, Optional, Sequence
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RecordManager(ABC):
|
|
|
|
|
"""Abstract base class representing the interface for a record manager."""
|
|
|
|
|
"""Abstract base class representing the interface for a record manager.
|
|
|
|
|
|
|
|
|
|
The record manager abstraction is used by the langchain indexing API.
|
|
|
|
|
|
|
|
|
|
The record manager keeps track of which documents have been
|
|
|
|
|
written into a vectorstore and when they were written.
|
|
|
|
|
|
|
|
|
|
The indexing API computes hashes for each document and stores the hash
|
|
|
|
|
together with the write time and the source id in the record manager.
|
|
|
|
|
|
|
|
|
|
On subsequent indexing runs, the indexing API can check the record manager
|
|
|
|
|
to determine which documents have already been indexed and which have not.
|
|
|
|
|
|
|
|
|
|
This allows the indexing API to avoid re-indexing documents that have
|
|
|
|
|
already been indexed, and to only index new documents.
|
|
|
|
|
|
|
|
|
|
The main benefit of this abstraction is that it works across many vectorstores.
|
|
|
|
|
To be supported, a vectorstore only needs to support the ability to add and
|
|
|
|
|
delete documents by ID. Using the record manager, the indexing API will
|
|
|
|
|
be able to delete outdated documents and avoid redundant indexing of documents
|
|
|
|
|
that have already been indexed.
|
|
|
|
|
|
|
|
|
|
The main constraints of this abstraction are:
|
|
|
|
|
|
|
|
|
|
1. It relies on timestamps to determine which documents have been
|
|
|
|
|
indexed and which have not. This means that the timestamps must be
|
|
|
|
|
monotonically increasing. The timestamp should be the time
|
|
|
|
|
as measured by the server to minimize issues.
|
|
|
|
|
2. The record manager is currently implemented separately from the
|
|
|
|
|
vectorstore, which means that the overall system becomes distributed
|
|
|
|
|
and may create issues with consistency. For example, writing to the
|
|
|
|
|
record manager succeeds but the corresponding write to the vectorstore fails.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
|