2022-11-01 03:17:22 +00:00
|
|
|
"""Test text splitting functionality."""
|
|
|
|
import pytest
|
|
|
|
|
2022-12-21 03:24:08 +00:00
|
|
|
from langchain.docstore.document import Document
|
2023-01-08 23:11:10 +00:00
|
|
|
from langchain.text_splitter import (
|
|
|
|
CharacterTextSplitter,
|
|
|
|
RecursiveCharacterTextSplitter,
|
|
|
|
)
|
2022-11-01 03:17:22 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_character_text_splitter() -> None:
    """Split a space-separated string into overlapping chunks.

    With ``chunk_size=7`` and ``chunk_overlap=3`` the expected chunks are
    two-word windows in which each chunk repeats the last word of the
    previous one.
    """
    chunks = CharacterTextSplitter(
        separator=" ", chunk_size=7, chunk_overlap=3
    ).split_text("foo bar baz 123")
    assert chunks == ["foo bar", "bar baz", "baz 123"]
|
|
|
|
|
|
|
|
|
2023-01-09 03:19:32 +00:00
|
|
|
def test_character_text_splitter_empty_doc() -> None:
    """Ensure character splitting never yields an empty chunk.

    Both words are longer than ``chunk_size=2``; the expected result is the
    two words on their own, with no empty strings in between.
    """
    chunks = CharacterTextSplitter(
        separator=" ", chunk_size=2, chunk_overlap=0
    ).split_text("foo bar")
    assert chunks == ["foo", "bar"]
|
|
|
|
|
|
|
|
|
2023-03-07 23:42:28 +00:00
|
|
|
def test_character_text_splitter_separtor_empty_doc() -> None:
    """Cover the edge case where a separator sits at a chunk boundary.

    The input ``"f b"`` is three characters long with ``chunk_size=2``; the
    expected chunks are the two single letters, without the separator.
    """
    chunks = CharacterTextSplitter(
        separator=" ", chunk_size=2, chunk_overlap=0
    ).split_text("f b")
    assert chunks == ["f", "b"]
|
|
|
|
|
|
|
|
|
2022-12-19 01:21:43 +00:00
|
|
|
def test_character_text_splitter_long() -> None:
    """Split by character count when the long words come first.

    Each three-letter word is expected as its own chunk, while the two
    trailing one-letter words are expected to be merged into ``"a a"``.
    """
    chunks = CharacterTextSplitter(
        separator=" ", chunk_size=3, chunk_overlap=1
    ).split_text("foo bar baz a a")
    assert chunks == ["foo", "bar", "baz", "a a"]
|
|
|
|
|
|
|
|
|
2023-01-08 23:11:10 +00:00
|
|
|
def test_character_text_splitter_short_words_first() -> None:
    """Split by character count when the short words come first.

    The two leading one-letter words are expected to be merged into
    ``"a a"``; each following three-letter word is expected on its own.
    """
    chunks = CharacterTextSplitter(
        separator=" ", chunk_size=3, chunk_overlap=1
    ).split_text("a a foo bar baz")
    assert chunks == ["a a", "foo", "bar", "baz"]
|
|
|
|
|
|
|
|
|
2022-11-01 03:17:22 +00:00
|
|
|
def test_character_text_splitter_longer_words() -> None:
    """Split text in which every word is longer than the chunk size.

    With ``chunk_size=1`` no word fits, so the expected output is simply
    one chunk per word.
    """
    chunks = CharacterTextSplitter(
        separator=" ", chunk_size=1, chunk_overlap=1
    ).split_text("foo bar baz 123")
    assert chunks == ["foo", "bar", "baz", "123"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_character_text_splitting_args() -> None:
    """Reject invalid constructor arguments.

    An overlap (4) larger than the chunk size (2) is expected to raise
    ``ValueError`` when the splitter is constructed.
    """
    invalid_config = {"chunk_size": 2, "chunk_overlap": 4}
    with pytest.raises(ValueError):
        CharacterTextSplitter(**invalid_config)
|
2022-12-21 03:24:08 +00:00
|
|
|
|
|
|
|
|
2023-04-25 17:02:59 +00:00
|
|
|
def test_merge_splits() -> None:
    """Merge pre-split pieces back together using a given separator.

    With ``chunk_size=9`` the first two pieces are expected to be joined
    into ``"foo bar"`` and the third piece to stay on its own.
    """
    pieces = ["foo", "bar", "baz"]
    merger = CharacterTextSplitter(separator=" ", chunk_size=9, chunk_overlap=2)
    merged = merger._merge_splits(pieces, separator=" ")
    assert merged == ["foo bar", "baz"]
|
|
|
|
|
|
|
|
|
2022-12-21 03:24:08 +00:00
|
|
|
def test_create_documents() -> None:
    """Build documents from several input texts.

    Each word of the two input texts is expected to end up in its own
    ``Document``, in input order.
    """
    doc_builder = CharacterTextSplitter(
        separator=" ", chunk_size=3, chunk_overlap=0
    )
    docs = doc_builder.create_documents(["foo bar", "baz"])
    wanted = [Document(page_content=word) for word in ("foo", "bar", "baz")]
    assert docs == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_create_documents_with_metadata() -> None:
    """Build documents from texts that each carry their own metadata.

    Every produced ``Document`` is expected to have the metadata of the
    text it was split from.
    """
    doc_builder = CharacterTextSplitter(
        separator=" ", chunk_size=3, chunk_overlap=0
    )
    metadatas = [{"source": "1"}, {"source": "2"}]
    docs = doc_builder.create_documents(["foo bar", "baz"], metadatas)
    # (chunk text, source id) pairs, in the order the chunks are expected.
    wanted = [
        Document(page_content=content, metadata={"source": source})
        for content, source in (("foo", "1"), ("bar", "1"), ("baz", "2"))
    ]
    assert docs == wanted
|
2023-01-08 23:11:10 +00:00
|
|
|
|
|
|
|
|
2023-03-11 17:18:25 +00:00
|
|
|
def test_metadata_not_shallow() -> None:
    """Verify that sibling documents do not share one metadata dict.

    After the two documents split from a single text are checked, the
    first document's metadata is mutated; the second document's metadata
    must remain unchanged.
    """
    doc_builder = CharacterTextSplitter(
        separator=" ", chunk_size=3, chunk_overlap=0
    )
    docs = doc_builder.create_documents(["foo bar"], [{"source": "1"}])
    wanted = [
        Document(page_content=content, metadata={"source": "1"})
        for content in ("foo", "bar")
    ]
    assert docs == wanted

    # Mutate only the first document; the change must not leak to the second.
    first, second = docs[0], docs[1]
    first.metadata["foo"] = 1
    assert first.metadata == {"source": "1", "foo": 1}
    assert second.metadata == {"source": "1"}
|
|
|
|
|
|
|
|
|
2023-01-08 23:11:10 +00:00
|
|
|
def test_iterative_text_splitter() -> None:
    """Exercise the recursive character splitter on mixed-separator text.

    The sample mixes blank lines, single newlines and spaces. With
    ``chunk_size=10`` and ``chunk_overlap=1`` the over-long word
    ``"splittingggg"`` is expected to come back as two pieces.
    """
    sample = """Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.
This is a weird text to write, but gotta test the splittingggg some how.

Bye!\n\n-H."""
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=10, chunk_overlap=1
    ).split_text(sample)
    wanted = [
        "Hi.",
        "I'm",
        "Harrison.",
        "How? Are?",
        "You?",
        "Okay then",
        "f f f f.",
        "This is a",
        "a weird",
        "text to",
        "write, but",
        "gotta test",
        "the",
        "splittingg",
        "ggg",
        "some how.",
        "Bye!\n\n-H.",
    ]
    assert chunks == wanted
|