Add language as a parameter to the NLTK text splitter (#10229)

- Description: 
Adds language as a parameter to the NLTK text splitter, which until now
only used English. This makes the NLTK splitter usable for other
languages. The change is simple: language is added as a parameter to
NLTKTextSplitter (defaulting to "english") and then passed to nltk's
"sent_tokenize".
  
  - Issue: N/A
  
  - Dependencies: N/A

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
pull/10291/head
Tarek Abouzeid 12 months ago committed by GitHub
parent b3a8fc7cb1
commit ddd07001f3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1081,7 +1081,9 @@ class RecursiveCharacterTextSplitter(TextSplitter):
class NLTKTextSplitter(TextSplitter):
"""Splitting text using NLTK package."""
def __init__(self, separator: str = "\n\n", **kwargs: Any) -> None:
def __init__(
self, separator: str = "\n\n", language: str = "english", **kwargs: Any
) -> None:
"""Initialize the NLTK splitter."""
super().__init__(**kwargs)
try:
@ -1093,11 +1095,12 @@ class NLTKTextSplitter(TextSplitter):
"NLTK is not installed, please install it with `pip install nltk`."
)
self._separator = separator
self._language = language
def split_text(self, text: str) -> List[str]:
"""Split incoming text and return chunks."""
# First we naively split the large input into a bunch of smaller ones.
splits = self._tokenizer(text)
splits = self._tokenizer(text, language=self._language)
return self._merge_splits(splits, self._separator)

Loading…
Cancel
Save