Bug: incorrect start_index when a chunk is a substring of another chunk

Sample code:

    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.docstore.document import Document
    splitter = RecursiveCharacterTextSplitter(chunk_size=5, chunk_overlap=5, separators=[" ", ""], add_start_index=True)
    splitter.split_documents([Document(page_content="chunk chunk")])

    Before this commit:

    [Document(page_content='chunk', metadata={'start_index': 0}),
     Document(page_content='chun', metadata={'start_index': 0}),
     Document(page_content='chunk', metadata={'start_index': 0})]

    After this commit:

    [Document(page_content='chunk', metadata={'start_index': 0}),
     Document(page_content='chun', metadata={'start_index': 6}),
     Document(page_content='chunk', metadata={'start_index': 6})]

    This resolves https://github.com/langchain-ai/langchain/issues/21475
pull/21477/head
Ravi Maggon 3 weeks ago committed by GitHub
parent f178c67ad0
commit 41c034a96f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@@ -73,13 +73,16 @@ class TextSplitter(BaseDocumentTransformer, ABC):
         _metadatas = metadatas or [{}] * len(texts)
         documents = []
         for i, text in enumerate(texts):
-            index = 0
+            index = -1
             previous_chunk_len = 0
-            for chunk in self.split_text(text):
+            for j, chunk in enumerate(self.split_text(text)):
                 metadata = copy.deepcopy(_metadatas[i])
                 if self._add_start_index:
-                    offset = index + previous_chunk_len - self._chunk_overlap
-                    index = text.find(chunk, max(0, offset))
+                    if j > 0:
+                        minimum_index_offset = max(0, previous_chunk_len - self._chunk_overlap, previous_chunk_len - len(chunk))
+                    else:
+                        minimum_index_offset = 1
+                    index = text.find(chunk, index + minimum_index_offset)
                     metadata["start_index"] = index
                     previous_chunk_len = len(chunk)
                 new_doc = Document(page_content=chunk, metadata=metadata)

Loading…
Cancel
Save