Add spacy sentencizer (#7442)
`SpacyTextSplitter` currently uses spacy's statistics-based `en_core_web_sm` model for sentence splitting. This is a good splitter, but it's also pretty slow, and in this case it's doing a lot of work that's not needed given that the spacy parse is then just thrown away. However, there is also a simple rules-based spacy sentencizer. Using this is at least an order of magnitude faster than using `en_core_web_sm` according to my local tests. Also, spacy sentence tokenization based on `en_core_web_sm` can be sped up in this case by not doing the NER stage. This shaves some cycles too, both when loading the model and when parsing the text. Consequently, this PR adds the option to use the basic spacy sentencizer, and it disables the NER stage for the current approach, *which is kept as the default*. Lastly, when extracting the tokenized sentences, the `text` attribute is called directly instead of doing the string conversion, which is IMO a bit more idiomatic.
parent 50a9fcccb0
commit 7ffc431b3a
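For reference, the two spacy pipeline setups the PR chooses between can be sketched as follows. This is a standalone illustration, not part of the diff; the sample text is just a placeholder.

import spacy
from spacy.lang.en import English

# Rule-based variant: a blank English pipeline with only the sentencizer
# component. No statistical model is loaded, which is where the speed-up
# comes from.
fast_nlp = English()
fast_nlp.add_pipe("sentencizer")

# Statistical variant: the full en_core_web_sm model, with the NER stage
# disabled because its output is not needed for sentence splitting.
model_nlp = spacy.load("en_core_web_sm", disable=["ner"])

text = "This is sentence one. And this is sentence two."
# Both pipelines expose sentences via .sents; each sentence's raw string
# is available through the .text attribute.
sentences = [sent.text for sent in fast_nlp(text).sents]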
@@ -34,6 +34,23 @@ logger = logging.getLogger(__name__)
 TS = TypeVar("TS", bound="TextSplitter")
 
 
+def _make_spacy_pipeline_for_splitting(pipeline: str) -> Any:  # avoid importing spacy
+    try:
+        import spacy
+    except ImportError:
+        raise ImportError(
+            "Spacy is not installed, please install it with `pip install spacy`."
+        )
+    if pipeline == "sentencizer":
+        from spacy.lang.en import English
+
+        sentencizer = English()
+        sentencizer.add_pipe("sentencizer")
+    else:
+        sentencizer = spacy.load(pipeline, disable=["ner"])
+    return sentencizer
+
+
 def _split_text_with_regex(
     text: str, separator: str, keep_separator: bool
 ) -> List[str]:
@@ -1010,25 +1027,24 @@ class NLTKTextSplitter(TextSplitter):
 
 
 class SpacyTextSplitter(TextSplitter):
-    """Implementation of splitting text that looks at sentences using Spacy."""
+    """Implementation of splitting text that looks at sentences using Spacy.
+
+    Per default, Spacy's `en_core_web_sm` model is used. For a faster, but
+    potentially less accurate splitting, you can use `pipeline='sentencizer'`.
+    """
 
     def __init__(
         self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", **kwargs: Any
     ) -> None:
         """Initialize the spacy text splitter."""
         super().__init__(**kwargs)
-        try:
-            import spacy
-        except ImportError:
-            raise ImportError(
-                "Spacy is not installed, please install it with `pip install spacy`."
-            )
-        self._tokenizer = spacy.load(pipeline)
+        self._tokenizer = _make_spacy_pipeline_for_splitting(pipeline)
         self._separator = separator
 
     def split_text(self, text: str) -> List[str]:
         """Split incoming text and return chunks."""
-        splits = (str(s) for s in self._tokenizer(text).sents)
+        splits = (s.text for s in self._tokenizer(text).sents)
         return self._merge_splits(splits, self._separator)
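A quick usage sketch of the new option (assuming `SpacyTextSplitter` is importable from `langchain.text_splitter`; the default behaviour is unchanged):

from langchain.text_splitter import SpacyTextSplitter

# Default: en_core_web_sm, now loaded with the NER stage disabled.
default_splitter = SpacyTextSplitter()

# Opt in to the faster rule-based sentencizer.
fast_splitter = SpacyTextSplitter(pipeline="sentencizer")
chunks = fast_splitter.split_text("This is sentence one. And this is sentence two.")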
@@ -26,11 +26,12 @@ def test_nltk_text_splitter() -> None:
     assert output == expected_output
 
 
-def test_spacy_text_splitter() -> None:
+@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
+def test_spacy_text_splitter(pipeline: str) -> None:
     """Test splitting by sentence using Spacy."""
     text = "This is sentence one. And this is sentence two."
     separator = "|||"
-    splitter = SpacyTextSplitter(separator=separator)
+    splitter = SpacyTextSplitter(separator=separator, pipeline=pipeline)
     output = splitter.split_text(text)
     expected_output = [f"This is sentence one.{separator}And this is sentence two."]
     assert output == expected_output