mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
text-splitters[patch]: Modified SpacyTextSplitter to fully keep whitespace when strip_whitespace is false (#23272)
Previously, regardless of whether strip_whitespace was set to true or false, the `split_text` method in the SpacyTextSplitter class used `sent.text` to get the sentence. I modified this to include a ternary such that if strip_whitespace is false, it uses `sent.text_with_ws`. I also modified the pyproject.toml to include the spacy pipeline package and to lock the numpy version, as higher versions break spacy. - **Issue:** N/A - **Dependencies:** None
This commit is contained in:
parent
3145995ed9
commit
66828f4ecc
@ -20,6 +20,8 @@ class SpacyTextSplitter(TextSplitter):
|
||||
separator: str = "\n\n",
|
||||
pipeline: str = "en_core_web_sm",
|
||||
max_length: int = 1_000_000,
|
||||
*,
|
||||
strip_whitespace: bool = True,
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Initialize the spacy text splitter."""
|
||||
@ -28,10 +30,14 @@ class SpacyTextSplitter(TextSplitter):
|
||||
pipeline, max_length=max_length
|
||||
)
|
||||
self._separator = separator
|
||||
self._strip_whitespace = strip_whitespace
|
||||
|
||||
def split_text(self, text: str) -> List[str]:
|
||||
"""Split incoming text and return chunks."""
|
||||
splits = (s.text for s in self._tokenizer(text).sents)
|
||||
splits = (
|
||||
s.text if self._strip_whitespace else s.text_with_ws
|
||||
for s in self._tokenizer(text).sents
|
||||
)
|
||||
return self._merge_splits(splits, self._separator)
|
||||
|
||||
|
||||
|
@ -37,3 +37,16 @@ def test_spacy_text_splitter(pipeline: str) -> None:
|
||||
output = splitter.split_text(text)
|
||||
expected_output = [f"This is sentence one.{separator}And this is sentence two."]
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
|
||||
def test_spacy_text_splitter_strip_whitespace(pipeline: str) -> None:
|
||||
"""Test splitting by sentence using Spacy."""
|
||||
text = "This is sentence one. And this is sentence two."
|
||||
separator = "|||"
|
||||
splitter = SpacyTextSplitter(
|
||||
separator=separator, pipeline=pipeline, strip_whitespace=False
|
||||
)
|
||||
output = splitter.split_text(text)
|
||||
expected_output = [f"This is sentence one. {separator}And this is sentence two."]
|
||||
assert output == expected_output
|
||||
|
Loading…
Reference in New Issue
Block a user