diff --git a/README.md b/README.md
index b32ac6a8..a9f6ec27 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,8 @@ The following use cases require specific installs and api keys:
 - _FAISS_:
     - Install requirements with `pip install faiss` for Python 3.7 and `pip install faiss-cpu` for Python 3.10+.
 
+If you are using the `NLTKTextSplitter` or the `SpacyTextSplitter`, you will also need to install the corresponding models. For the `SpacyTextSplitter`, install the `en_core_web_sm` model with `python -m spacy download en_core_web_sm`; for the `NLTKTextSplitter`, install the `punkt` tokenizer with `python -m nltk.downloader punkt`.
+
 ## 🚀 What can I do with this
 
 This project was largely inspired by a few projects seen on Twitter for which we thought it would make sense to have more explicit tooling. A lot of the initial functionality was done in an attempt to recreate those. Those are:
diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py
index d635e64f..1c44d987 100644
--- a/langchain/text_splitter.py
+++ b/langchain/text_splitter.py
@@ -1,23 +1,13 @@
 """Functionality for splitting text."""
 from abc import abstractmethod
-from typing import List
+from typing import Iterable, List
 
 
 class TextSplitter:
     """Interface for splitting text into chunks."""
 
-    @abstractmethod
-    def split_text(self, text: str) -> List[str]:
-        """Split text into multiple components."""
-
-
-class CharacterTextSplitter(TextSplitter):
-    """Implementation of splitting text that looks at characters."""
-
-    def __init__(
-        self, separator: str = "\n\n", chunk_size: int = 4000, chunk_overlap: int = 200
-    ):
-        """Initialize with parameters."""
+    def __init__(self, separator: str, chunk_size: int, chunk_overlap: int):
+        """Create a new TextSplitter."""
         if chunk_overlap > chunk_size:
             raise ValueError(
                 f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
@@ -27,10 +17,11 @@ class CharacterTextSplitter(TextSplitter):
         self._chunk_size = chunk_size
         self._chunk_overlap = chunk_overlap
 
+    @abstractmethod
     def split_text(self, text: str) -> List[str]:
-        """Split incoming text and return chunks."""
-        # First we naively split the large input into a bunch of smaller ones.
-        splits = text.split(self._separator)
+        """Split text into multiple components."""
+
+    def _merge_splits(self, splits: Iterable[str]) -> List[str]:
         # We now want to combine these smaller pieces into medium size
         # chunks to send to the LLM.
         docs = []
@@ -46,3 +37,72 @@ class CharacterTextSplitter(TextSplitter):
             total += len(d)
         docs.append(self._separator.join(current_doc))
         return docs
+
+
+class CharacterTextSplitter(TextSplitter):
+    """Implementation of splitting text that looks at characters."""
+
+    def __init__(
+        self, separator: str = "\n\n", chunk_size: int = 4000, chunk_overlap: int = 200
+    ):
+        """Create a new CharacterTextSplitter."""
+        super(CharacterTextSplitter, self).__init__(
+            separator, chunk_size, chunk_overlap
+        )
+        self._separator = separator
+
+    def split_text(self, text: str) -> List[str]:
+        """Split incoming text and return chunks."""
+        # First we naively split the large input into a bunch of smaller ones.
+        splits = text.split(self._separator)
+        return self._merge_splits(splits)
+
+
+class NLTKTextSplitter(TextSplitter):
+    """Implementation of splitting text that looks at sentences using NLTK."""
+
+    def __init__(
+        self, separator: str = "\n\n", chunk_size: int = 4000, chunk_overlap: int = 200
+    ):
+        """Initialize the NLTK splitter."""
+        super(NLTKTextSplitter, self).__init__(separator, chunk_size, chunk_overlap)
+        try:
+            from nltk.tokenize import sent_tokenize
+
+            self._tokenizer = sent_tokenize
+        except ImportError:
+            raise ImportError(
+                "NLTK is not installed, please install it with `pip install nltk`."
+            )
+
+    def split_text(self, text: str) -> List[str]:
+        """Split incoming text and return chunks."""
+        # First we split into sentences using NLTK's sentence tokenizer.
+        splits = self._tokenizer(text)
+        return self._merge_splits(splits)
+
+
+class SpacyTextSplitter(TextSplitter):
+    """Implementation of splitting text that looks at sentences using Spacy."""
+
+    def __init__(
+        self,
+        separator: str = "\n\n",
+        pipeline: str = "en_core_web_sm",
+        chunk_size: int = 4000,
+        chunk_overlap: int = 200,
+    ):
+        """Initialize the spacy text splitter."""
+        super(SpacyTextSplitter, self).__init__(separator, chunk_size, chunk_overlap)
+        try:
+            import spacy
+        except ImportError:
+            raise ImportError(
+                "Spacy is not installed, please install it with `pip install spacy`."
+            )
+        self._tokenizer = spacy.load(pipeline)
+
+    def split_text(self, text: str) -> List[str]:
+        """Split incoming text and return chunks."""
+        splits = (str(s) for s in self._tokenizer(text).sents)
+        return self._merge_splits(splits)
diff --git a/requirements.txt b/requirements.txt
index 42513511..0cbdcbf2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,5 +10,7 @@ wikipedia
 huggingface_hub
 faiss-cpu
 sentence_transformers
+spacy
+nltk
 # For development
 jupyter
diff --git a/tests/integration_tests/test_nlp_text_splitters.py b/tests/integration_tests/test_nlp_text_splitters.py
new file mode 100644
index 00000000..73409227
--- /dev/null
+++ b/tests/integration_tests/test_nlp_text_splitters.py
@@ -0,0 +1,38 @@
+"""
+Test text splitting functionality using NLTK- and Spacy-based sentence splitters.
+"""
+import pytest
+
+from langchain.text_splitter import NLTKTextSplitter, SpacyTextSplitter
+
+
+def test_nltk_text_splitting_args() -> None:
+    """Test invalid arguments."""
+    with pytest.raises(ValueError):
+        NLTKTextSplitter(chunk_size=2, chunk_overlap=4)
+
+
+def test_spacy_text_splitting_args() -> None:
+    """Test invalid arguments."""
+    with pytest.raises(ValueError):
+        SpacyTextSplitter(chunk_size=2, chunk_overlap=4)
+
+
+def test_nltk_text_splitter() -> None:
+    """Test splitting by sentence using NLTK."""
+    text = "This is sentence one. And this is sentence two."
+    separator = "|||"
+    splitter = NLTKTextSplitter(separator=separator)
+    output = splitter.split_text(text)
+    expected_output = [f"This is sentence one.{separator}And this is sentence two."]
+    assert output == expected_output
+
+
+def test_spacy_text_splitter() -> None:
+    """Test splitting by sentence using Spacy."""
+    text = "This is sentence one. And this is sentence two."
+    separator = "|||"
+    splitter = SpacyTextSplitter(separator=separator)
+    output = splitter.split_text(text)
+    expected_output = [f"This is sentence one.{separator}And this is sentence two."]
+    assert output == expected_output
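For reviewers who want to try the new splitters, here is a minimal usage sketch (not part of the patch). It assumes `nltk` and `spacy` are installed per the updated requirements.txt, along with the `punkt` and `en_core_web_sm` models from the README note; the sample text is borrowed from the new integration tests.

```python
# Minimal usage sketch for the splitters added in this diff (not part of the
# patch). Assumes: pip install nltk spacy, then
#   python -m nltk.downloader punkt
#   python -m spacy download en_core_web_sm
from langchain.text_splitter import NLTKTextSplitter, SpacyTextSplitter

text = "This is sentence one. And this is sentence two."

# Each splitter tokenizes into sentences, then _merge_splits joins sentences
# back into chunks of at most chunk_size characters, joined by `separator`.
nltk_splitter = NLTKTextSplitter(separator="\n\n", chunk_size=4000, chunk_overlap=200)
print(nltk_splitter.split_text(text))
# ['This is sentence one.\n\nAnd this is sentence two.']

spacy_splitter = SpacyTextSplitter(pipeline="en_core_web_sm")
print(spacy_splitter.split_text(text))
# ['This is sentence one.\n\nAnd this is sentence two.']
```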
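The diff also moves the chunk-size/overlap validation and the `_merge_splits` merging logic from `CharacterTextSplitter` up into the `TextSplitter` base class, so every subclass inherits both. A small sketch of the shared validation, which is the behavior the new `*_splitting_args` tests pin down:

```python
# Sketch of the validation now shared via TextSplitter.__init__ in this diff:
# any subclass rejects chunk_overlap > chunk_size at construction time.
from langchain.text_splitter import CharacterTextSplitter, NLTKTextSplitter

# Fine: the overlap is smaller than the chunk size.
CharacterTextSplitter(separator="\n\n", chunk_size=10, chunk_overlap=2)

# Raises ValueError from the shared base __init__; note this fires even
# before NLTKTextSplitter attempts its nltk import, since super().__init__
# runs first.
try:
    NLTKTextSplitter(chunk_size=2, chunk_overlap=4)
except ValueError as err:
    print(err)  # complains that the overlap (4) exceeds the chunk size (2)
```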