mirror of https://github.com/hwchase17/langchain
Implements NLTK and Spacy-based TextSplitters (#103)
This PR is for Issue #88 - [x] `make format` - [x] `make lint` - [x] `make tests` (ref: pull/91/head)
parent
28282ad099
commit
3ee6e332dd
@ -0,0 +1,38 @@
|
||||
"""
|
||||
Test text splitting functionality using NLTK and Spacy based sentence splitters.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from langchain.text_splitter import NLTKTextSplitter, SpacyTextSplitter
|
||||
|
||||
|
||||
def test_nltk_text_splitting_args() -> None:
    """Constructing NLTKTextSplitter with invalid size arguments must fail.

    The splitter is built with chunk_size=2 and chunk_overlap=4, and the
    constructor is expected to raise ValueError for that combination.
    """
    # NOTE(review): presumably rejected because the overlap exceeds the
    # chunk size -- confirm against the splitter's validation logic.
    with pytest.raises(ValueError):
        NLTKTextSplitter(chunk_overlap=4, chunk_size=2)
|
||||
|
||||
|
||||
def test_spacy_text_splitting_args() -> None:
    """Constructing SpacyTextSplitter with invalid size arguments must fail.

    The splitter is built with chunk_size=2 and chunk_overlap=4, and the
    constructor is expected to raise ValueError for that combination.
    """
    # NOTE(review): presumably rejected because the overlap exceeds the
    # chunk size -- confirm against the splitter's validation logic.
    with pytest.raises(ValueError):
        SpacyTextSplitter(chunk_overlap=4, chunk_size=2)
|
||||
|
||||
|
||||
def test_nltk_text_splitter() -> None:
    """Check the chunk produced by NLTKTextSplitter for a two-sentence input.

    The splitter is configured with a custom separator, and the test expects
    a single chunk in which the two sentences are joined by that separator.
    """
    separator = "|||"
    sample = "This is sentence one. And this is sentence two."
    # Expected result: one chunk, with the separator replacing the space
    # between the two sentences.
    expected = [f"This is sentence one.{separator}And this is sentence two."]
    actual = NLTKTextSplitter(separator=separator).split_text(sample)
    assert actual == expected
|
||||
|
||||
|
||||
def test_spacy_text_splitter() -> None:
    """Check the chunk produced by SpacyTextSplitter for a two-sentence input.

    The splitter is configured with a custom separator, and the test expects
    a single chunk in which the two sentences are joined by that separator.
    """
    separator = "|||"
    sample = "This is sentence one. And this is sentence two."
    # Expected result: one chunk, with the separator replacing the space
    # between the two sentences.
    expected = [f"This is sentence one.{separator}And this is sentence two."]
    actual = SpacyTextSplitter(separator=separator).split_text(sample)
    assert actual == expected
|
Loading…
Reference in New Issue