From ddd07001f354cd09a76a61e1f5c678bf885506d2 Mon Sep 17 00:00:00 2001 From: Tarek Abouzeid Date: Sat, 9 Sep 2023 02:59:23 +0200 Subject: [PATCH] adding language as parameter to NLTK text splitter (#10229) - Description: Add `language` as a parameter to NLTKTextSplitter; until now the splitter only used English. This makes the NLTK splitter usable for other languages. The change is simple: NLTKTextSplitter accepts a `language` argument (default "english") and passes it to nltk's "sent_tokenize". - Issue: N/A - Dependencies: N/A --------- Co-authored-by: Eugene Yurtsev --- libs/langchain/langchain/text_splitter.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py index be189548c1..2e5f7021f3 100644 --- a/libs/langchain/langchain/text_splitter.py +++ b/libs/langchain/langchain/text_splitter.py @@ -1081,7 +1081,9 @@ class RecursiveCharacterTextSplitter(TextSplitter): class NLTKTextSplitter(TextSplitter): """Splitting text using NLTK package.""" - def __init__(self, separator: str = "\n\n", **kwargs: Any) -> None: + def __init__( + self, separator: str = "\n\n", language: str = "english", **kwargs: Any + ) -> None: """Initialize the NLTK splitter.""" super().__init__(**kwargs) try: @@ -1093,11 +1095,12 @@ class NLTKTextSplitter(TextSplitter): "NLTK is not installed, please install it with `pip install nltk`." ) self._separator = separator + self._language = language def split_text(self, text: str) -> List[str]: """Split incoming text and return chunks.""" # First we naively split the large input into a bunch of smaller ones. - splits = self._tokenizer(text) + splits = self._tokenizer(text, language=self._language) return self._merge_splits(splits, self._separator)