# langchain/tests/unit_tests/test_text_splitter.py
# Repository viewer metadata: 38 lines, 1.3 KiB, Python.
# Blame timestamp: 2022-11-01 03:17:22 +00:00
"""Test text splitting functionality."""
import pytest
from langchain.text_splitter import CharacterTextSplitter
def test_character_text_splitter() -> None:
    """Test splitting by character count.

    Splits ``"foo bar baz 123"`` on single spaces with ``chunk_size=7`` and
    ``chunk_overlap=3`` and checks that every chunk holds two adjacent words,
    with the last word of one chunk repeated as the first word of the next.
    """
    text = "foo bar baz 123"
    splitter = CharacterTextSplitter(separator=" ", chunk_size=7, chunk_overlap=3)
    output = splitter.split_text(text)
    expected_output = ["foo bar", "bar baz", "baz 123"]
    assert output == expected_output
# Blame timestamp: 2022-12-19 01:21:43 +00:00
def test_character_text_splitter_long() -> None:
    """Check chunking when most words already fill the whole chunk size.

    With ``chunk_size=3`` and ``chunk_overlap=1`` each three-letter word is
    expected as its own chunk, while the two trailing one-letter words are
    expected to be merged into a single chunk.
    """
    chunker = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=1)
    chunks = chunker.split_text("foo bar baz a a")
    assert chunks == ["foo", "bar", "baz", "a a"]
# Blame timestamp: 2022-11-01 03:17:22 +00:00
def test_character_text_splitter_longer_words() -> None:
    """Check chunking when every word is longer than the chunk size.

    With ``chunk_size=1`` no word fits in a chunk, and the expected result is
    each word returned whole as its own chunk.
    """
    expected_chunks = ["foo", "bar", "baz", "123"]
    chunker = CharacterTextSplitter(separator=" ", chunk_size=1, chunk_overlap=1)
    actual_chunks = chunker.split_text("foo bar baz 123")
    assert actual_chunks == expected_chunks
def test_character_text_splitting_args() -> None:
    """Check that an overlap larger than the chunk size raises ValueError."""
    # chunk_overlap (4) exceeds chunk_size (2); the constructor must reject it.
    invalid_kwargs = {"chunk_size": 2, "chunk_overlap": 4}
    with pytest.raises(ValueError):
        CharacterTextSplitter(**invalid_kwargs)