text-splitters[patch]: Modified SpacyTextSplitter to fully keep whitespace when strip_whitespace is false (#23272)

Previously, regardless of whether strip_whitespace was set to true or
false, the `split_text` method in the SpacyTextSplitter class always used
`sent.text` to get the sentence. I modified this to include a ternary
such that if strip_whitespace is false, it uses `sent.text_with_ws` instead.
I also modified the pyproject.toml to include the spacy pipeline package
and to lock the numpy version, as higher versions break spacy.

- **Issue:** N/a
- **Dependencies:** None
This commit is contained in:
Matthew DeGenaro 2024-09-02 17:15:56 -04:00 committed by GitHub
parent 3145995ed9
commit 66828f4ecc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 20 additions and 1 deletions

View File

@ -20,6 +20,8 @@ class SpacyTextSplitter(TextSplitter):
separator: str = "\n\n",
pipeline: str = "en_core_web_sm",
max_length: int = 1_000_000,
*,
strip_whitespace: bool = True,
**kwargs: Any,
) -> None:
"""Initialize the spacy text splitter."""
@ -28,10 +30,14 @@ class SpacyTextSplitter(TextSplitter):
pipeline, max_length=max_length
)
self._separator = separator
self._strip_whitespace = strip_whitespace
def split_text(self, text: str) -> List[str]:
    """Split incoming text into sentences and return the merged chunks.

    The text is run through ``self._tokenizer`` exactly once. Each sentence
    contributes ``s.text`` when ``self._strip_whitespace`` is truthy, and
    ``s.text_with_ws`` otherwise, so the whitespace carried by the sentence
    is kept when stripping is disabled.

    Args:
        text: The raw text to split.

    Returns:
        The result of ``self._merge_splits`` applied to the per-sentence
        strings and ``self._separator``.
    """
    # Tokenize once: a generator expression evaluates its outermost iterable
    # eagerly, so building a second, discarded generator over
    # ``self._tokenizer(text).sents`` would run the pipeline twice.
    sentences = self._tokenizer(text).sents
    splits = (
        s.text if self._strip_whitespace else s.text_with_ws for s in sentences
    )
    return self._merge_splits(splits, self._separator)

View File

@ -37,3 +37,16 @@ def test_spacy_text_splitter(pipeline: str) -> None:
output = splitter.split_text(text)
expected_output = [f"This is sentence one.{separator}And this is sentence two."]
assert output == expected_output
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter_strip_whitespace(pipeline: str) -> None:
    """Check that whitespace is kept when ``strip_whitespace`` is False.

    Two sentences are split and re-joined with a custom separator; the space
    that follows the first sentence is expected to remain in front of the
    separator in the single resulting chunk.
    """
    separator = "|||"
    splitter = SpacyTextSplitter(
        pipeline=pipeline, separator=separator, strip_whitespace=False
    )
    chunks = splitter.split_text("This is sentence one. And this is sentence two.")
    # The trailing space after "one." must survive ahead of the separator.
    assert chunks == [f"This is sentence one. {separator}And this is sentence two."]