Add start index to metadata in TextSplitter (#5912)

#### Add start index to metadata in TextSplitter - Modified method `create_documents` to track the start position of each chunk - The `start_index` is included in the metadata if the `add_start_index` parameter in the class constructor is set to `True`. This enables referencing back to the original document, which is particularly useful when a specific chunk is retrieved.  #### Who can review? Tag maintainers/contributors who might be interested: @eyurtsev @agola11
1 year ago · 2791a753bf
parent a09a0e3511
commit 2791a753bf
3 changed files with 30 additions and 7 deletions
--- a/docs/modules/indexes/text_splitters/getting_started.ipynb
+++ b/docs/modules/indexes/text_splitters/getting_started.ipynb
@ -12,7 +12,8 @@
    "\n",
    "- `length_function`: how the length of chunks is calculated. Defaults to just counting number of characters, but it's pretty common to pass a token counter here.\n",
    "- `chunk_size`: the maximum size of your chunks (as measured by the length function).\n",
-    "- `chunk_overlap`: the maximum overlap between chunks. It can be nice to have some overlap to maintain some continuity between chunks (eg do a sliding window)."
+    "- `chunk_overlap`: the maximum overlap between chunks. It can be nice to have some overlap to maintain some continuity between chunks (eg do a sliding window).\n",
+    "- `add_start_index`: whether to include the starting position of each chunk within the original document in the metadata. "
   ]
  },
  {
@ -49,6 +50,7 @@
    "    chunk_size = 100,\n",
    "    chunk_overlap  = 20,\n",
    "    length_function = len,\n",
+    "    add_start_index = True,\n",
    ")"
   ]
  },
@ -62,8 +64,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and' lookup_str='' metadata={} lookup_index=0\n",
-      "page_content='of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.' lookup_str='' metadata={} lookup_index=0\n"
+      "page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and' metadata={'start_index': 0}\n",
+      "page_content='of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.' metadata={'start_index': 82}\n"
     ]
    }
   ],
@ -90,7 +92,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.9.16"
  },
  "vscode": {
   "interpreter": {
--- a/langchain/text_splitter.py
+++ b/langchain/text_splitter.py
@ -58,6 +58,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        keep_separator: bool = False,
+        add_start_index: bool = False,
    ):
        """Create a new TextSplitter.

@ -66,6 +67,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether or not to keep the separator in the chunks
+            add_start_index: If `True`, includes chunk's start index in metadata
        """
        if chunk_overlap > chunk_size:
            raise ValueError(
@ -76,6 +78,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
        self._chunk_overlap = chunk_overlap
        self._length_function = length_function
        self._keep_separator = keep_separator
+        self._add_start_index = add_start_index

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
@ -88,10 +91,13 @@ class TextSplitter(BaseDocumentTransformer, ABC):
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
+            index = -1
            for chunk in self.split_text(text):
-                new_doc = Document(
-                    page_content=chunk, metadata=copy.deepcopy(_metadatas[i])
-                )
+                metadata = copy.deepcopy(_metadatas[i])
+                if self._add_start_index:
+                    index = text.find(chunk, index + 1)
+                    metadata["start_index"] = index
+                new_doc = Document(page_content=chunk, metadata=metadata)
                documents.append(new_doc)
        return documents

--- a/tests/unit_tests/test_text_splitter.py
+++ b/tests/unit_tests/test_text_splitter.py
@ -118,6 +118,21 @@ def test_create_documents_with_metadata() -> None:
    assert docs == expected_docs


+def test_create_documents_with_start_index() -> None:
+    """Test create documents method."""
+    texts = ["foo bar baz 123"]
+    splitter = CharacterTextSplitter(
+        separator=" ", chunk_size=7, chunk_overlap=3, add_start_index=True
+    )
+    docs = splitter.create_documents(texts)
+    expected_docs = [
+        Document(page_content="foo bar", metadata={"start_index": 0}),
+        Document(page_content="bar baz", metadata={"start_index": 4}),
+        Document(page_content="baz 123", metadata={"start_index": 8}),
+    ]
+    assert docs == expected_docs
+
+
 def test_metadata_not_shallow() -> None:
    """Test that metadatas are not shallow."""
    texts = ["foo bar"]