diff --git a/docs/modules/indexes/text_splitters/getting_started.ipynb b/docs/modules/indexes/text_splitters/getting_started.ipynb index ad8e179da4..ac396f786c 100644 --- a/docs/modules/indexes/text_splitters/getting_started.ipynb +++ b/docs/modules/indexes/text_splitters/getting_started.ipynb @@ -12,7 +12,8 @@ "\n", "- `length_function`: how the length of chunks is calculated. Defaults to just counting number of characters, but it's pretty common to pass a token counter here.\n", "- `chunk_size`: the maximum size of your chunks (as measured by the length function).\n", - "- `chunk_overlap`: the maximum overlap between chunks. It can be nice to have some overlap to maintain some continuity between chunks (eg do a sliding window)." + "- `chunk_overlap`: the maximum overlap between chunks. It can be nice to have some overlap to maintain some continuity between chunks (eg do a sliding window).\n", + "- `add_start_index`: whether to include the starting position of each chunk within the original document in the metadata. " ] }, { @@ -49,6 +50,7 @@ " chunk_size = 100,\n", " chunk_overlap = 20,\n", " length_function = len,\n", + " add_start_index = True,\n", ")" ] }, @@ -62,8 +64,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and' lookup_str='' metadata={} lookup_index=0\n", - "page_content='of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.' lookup_str='' metadata={} lookup_index=0\n" + "page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and' metadata={'start_index': 0}\n", + "page_content='of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.' 
metadata={'start_index': 82}\n" ] } ], @@ -90,7 +92,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.9.16" }, "vscode": { "interpreter": { diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py index fddeb02307..23332f291a 100644 --- a/langchain/text_splitter.py +++ b/langchain/text_splitter.py @@ -58,6 +58,7 @@ class TextSplitter(BaseDocumentTransformer, ABC): chunk_overlap: int = 200, length_function: Callable[[str], int] = len, keep_separator: bool = False, + add_start_index: bool = False, ): """Create a new TextSplitter. @@ -66,6 +67,7 @@ class TextSplitter(BaseDocumentTransformer, ABC): chunk_overlap: Overlap in characters between chunks length_function: Function that measures the length of given chunks keep_separator: Whether or not to keep the separator in the chunks + add_start_index: If `True`, includes chunk's start index in metadata """ if chunk_overlap > chunk_size: raise ValueError( @@ -76,6 +78,7 @@ class TextSplitter(BaseDocumentTransformer, ABC): self._chunk_overlap = chunk_overlap self._length_function = length_function self._keep_separator = keep_separator + self._add_start_index = add_start_index @abstractmethod def split_text(self, text: str) -> List[str]: @@ -88,10 +91,13 @@ class TextSplitter(BaseDocumentTransformer, ABC): _metadatas = metadatas or [{}] * len(texts) documents = [] for i, text in enumerate(texts): + index = -1 for chunk in self.split_text(text): - new_doc = Document( - page_content=chunk, metadata=copy.deepcopy(_metadatas[i]) - ) + metadata = copy.deepcopy(_metadatas[i]) + if self._add_start_index: + index = text.find(chunk, index + 1) + metadata["start_index"] = index + new_doc = Document(page_content=chunk, metadata=metadata) documents.append(new_doc) return documents diff --git a/tests/unit_tests/test_text_splitter.py b/tests/unit_tests/test_text_splitter.py index 7415728b89..2da634cd38 100644 --- a/tests/unit_tests/test_text_splitter.py 
+++ b/tests/unit_tests/test_text_splitter.py @@ -118,6 +118,21 @@ def test_create_documents_with_metadata() -> None: assert docs == expected_docs +def test_create_documents_with_start_index() -> None: + """Test create documents method.""" + texts = ["foo bar baz 123"] + splitter = CharacterTextSplitter( + separator=" ", chunk_size=7, chunk_overlap=3, add_start_index=True + ) + docs = splitter.create_documents(texts) + expected_docs = [ + Document(page_content="foo bar", metadata={"start_index": 0}), + Document(page_content="bar baz", metadata={"start_index": 4}), + Document(page_content="baz 123", metadata={"start_index": 8}), + ] + assert docs == expected_docs + + def test_metadata_not_shallow() -> None: """Test that metadatas are not shallow.""" texts = ["foo bar"]