Fix inconsistent behavior of CharacterTextSplitter when changing keep_separator (#7263)

- Description:
- When `keep_separator` is `True`, the `_split_text_with_regex()` method
in `text_splitter` splits with a regex, but when `keep_separator` is
`False` it uses `str.split()`, which treats the separator as a literal
string. The two modes therefore behave differently when the separator
contains a special regex character like `.` or `*`. This PR fixes that by
using `re.split()` in both cases.
- Issue: #7262 
- Tag maintainer: @baskaryan
This commit is contained in:
Sasmitha Manathunga 2023-07-06 19:00:03 +05:30 committed by GitHub
parent b151d4257a
commit 0c7a5cb206
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 26 additions and 1 deletions

View File

@ -47,7 +47,7 @@ def _split_text_with_regex(
splits += _splits[-1:]
splits = [_splits[0]] + splits
else:
splits = text.split(separator)
splits = re.split(separator, text)
else:
splits = list(text)
return [s for s in splits if s != ""]

View File

@ -80,6 +80,31 @@ def test_character_text_splitter_longer_words() -> None:
assert output == expected_output
def test_character_text_splitter_keep_separator_regex() -> None:
    """Verify splitting on a regex-special separator while keeping it.

    The separator is an escaped dot, and with ``keep_separator=True`` every
    piece after the first is expected to carry the dot as its prefix.
    """
    expected_output = ["foo", ".bar", ".baz", ".123"]
    splitter = CharacterTextSplitter(
        separator=r"\.",
        chunk_size=1,
        chunk_overlap=0,
        keep_separator=True,
    )
    assert splitter.split_text("foo.bar.baz.123") == expected_output
def test_character_text_splitter_discard_separator_regex() -> None:
    """Verify splitting on a regex-special separator while discarding it.

    The separator is an escaped dot, and with ``keep_separator=False`` the
    expected pieces contain no dot at all.
    """
    expected_output = ["foo", "bar", "baz", "123"]
    splitter = CharacterTextSplitter(
        separator=r"\.",
        chunk_size=1,
        chunk_overlap=0,
        keep_separator=False,
    )
    assert splitter.split_text("foo.bar.baz.123") == expected_output
def test_character_text_splitting_args() -> None:
"""Test invalid arguments."""
with pytest.raises(ValueError):