core[patch]: Expand documentation in the indexing namespace (#23134)

2024-11-06 03:20:49 +00:00 · 2024-06-19 10:11:44 -04:00 · 2024-06-19 10:11:44 -04:00 · 4fe8403bfb
commit 4fe8403bfb
parent fe4f10047b
2 changed files with 45 additions and 5 deletions
--- a/libs/core/langchain_core/indexing/api.py
+++ b/libs/core/langchain_core/indexing/api.py
@@ -214,10 +214,18 @@ def index(
     are not able to specify the uid of the document.

    IMPORTANT:
-       if auto_cleanup is set to True, the loader should be returning
+       * If auto_cleanup is set to True, the loader should be returning
         the entire dataset, and not just a subset of the dataset.
         Otherwise, the auto_cleanup will remove documents that it is not
         supposed to.
+       * In incremental mode, if documents associated with a particular
+         source id appear across different batches, the indexing API
+         will do some redundant work. This will still result in the
+         correct end state of the index, but will unfortunately not be
+         100% efficient. For example, if a given document is split into 15
+         chunks, and we index them using a batch size of 5, we'll have 3 batches
+         all with the same source id. In general, to avoid doing too much
+         redundant work, select as big a batch size as possible.

    Args:
        docs_source: Data loader or iterable of documents to index.
--- a/libs/core/langchain_core/indexing/base.py
+++ b/libs/core/langchain_core/indexing/base.py
@@ -5,7 +5,39 @@ from typing import List, Optional, Sequence


 class RecordManager(ABC):
-    """Abstract base class representing the interface for a record manager."""
+    """Abstract base class representing the interface for a record manager.
+
+    The record manager abstraction is used by the langchain indexing API.
+
+    The record manager keeps track of which documents have been
+    written into a vectorstore and when they were written.
+
+    The indexing API computes hashes for each document and stores the hash
+    together with the write time and the source id in the record manager.
+
+    On subsequent indexing runs, the indexing API can check the record manager
+    to determine which documents have already been indexed and which have not.
+
+    This allows the indexing API to avoid re-indexing documents that have
+    already been indexed, and to only index new documents.
+
+    The main benefit of this abstraction is that it works across many vectorstores.
+    To be supported, a vectorstore only needs to support the ability to add and
+    delete documents by ID. Using the record manager, the indexing API will
+    be able to delete outdated documents and avoid redundant indexing of documents
+    that have already been indexed.
+
+    The main constraints of this abstraction are:
+
+    1. It relies on the timestamps to determine which documents have been
+       indexed and which have not. This means that the timestamps must be
+       monotonically increasing. The timestamp should be the timestamp
+       as measured by the server to minimize issues.
+    2. The record manager is currently implemented separately from the
+       vectorstore, which means that the overall system becomes distributed
+       and may create issues with consistency. For example, a write to the
+       record manager may succeed while the write to the vectorstore fails.
+    """

    def __init__(
        self,