from __future__ import annotations from typing import Any, List from langchain_text_splitters.base import TextSplitter class SpacyTextSplitter(TextSplitter): """Splitting text using Spacy package. Per default, Spacy's `en_core_web_sm` model is used and its default max_length is 1000000 (it is the length of maximum character this model takes which can be increased for large files). For a faster, but potentially less accurate splitting, you can use `pipeline='sentencizer'`. """ def __init__( self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", max_length: int = 1_000_000, **kwargs: Any, ) -> None: """Initialize the spacy text splitter.""" super().__init__(**kwargs) self._tokenizer = _make_spacy_pipeline_for_splitting( pipeline, max_length=max_length ) self._separator = separator def split_text(self, text: str) -> List[str]: """Split incoming text and return chunks.""" splits = (s.text for s in self._tokenizer(text).sents) return self._merge_splits(splits, self._separator) def _make_spacy_pipeline_for_splitting( pipeline: str, *, max_length: int = 1_000_000 ) -> Any: # avoid importing spacy try: import spacy except ImportError: raise ImportError( "Spacy is not installed, please install it with `pip install spacy`." ) if pipeline == "sentencizer": from spacy.lang.en import English sentencizer: Any = English() sentencizer.add_pipe("sentencizer") else: sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"]) sentencizer.max_length = max_length return sentencizer