2022-11-01 03:17:22 +00:00
|
|
|
"""Test text splitting functionality."""
|
2023-06-10 23:48:53 +00:00
|
|
|
from typing import List
|
|
|
|
|
2022-11-01 03:17:22 +00:00
|
|
|
import pytest
|
|
|
|
|
2022-12-21 03:24:08 +00:00
|
|
|
from langchain.docstore.document import Document
|
2023-01-08 23:11:10 +00:00
|
|
|
from langchain.text_splitter import (
|
|
|
|
CharacterTextSplitter,
|
Add more code splitters (go, rst, js, java, cpp, scala, ruby, php, swift, rust) (#5171)
As the title says, I added more code splitters.
The implementation is trivial, so I didn't add separate tests for each
splitter.
Let me know if there are any concerns.
Fixes # (issue)
https://github.com/hwchase17/langchain/issues/5170
## Who can review?
Community members can review the PR once tests pass. Tag
maintainers/contributors who might be interested:
@eyurtsev @hwchase17
---------
Signed-off-by: byhsu <byhsu@linkedin.com>
Co-authored-by: byhsu <byhsu@linkedin.com>
2023-05-30 15:04:05 +00:00
|
|
|
Language,
|
2023-05-29 23:56:31 +00:00
|
|
|
PythonCodeTextSplitter,
|
2023-01-08 23:11:10 +00:00
|
|
|
RecursiveCharacterTextSplitter,
|
|
|
|
)
|
2022-11-01 03:17:22 +00:00
|
|
|
|
2023-05-29 23:56:31 +00:00
|
|
|
FAKE_PYTHON_TEXT = """
|
|
|
|
class Foo:
|
|
|
|
|
|
|
|
def bar():
|
|
|
|
|
|
|
|
|
|
|
|
def foo():
|
|
|
|
|
|
|
|
def testing_func():
|
|
|
|
|
|
|
|
def bar():
|
|
|
|
"""
|
|
|
|
|
2022-11-01 03:17:22 +00:00
|
|
|
|
|
|
|
def test_character_text_splitter() -> None:
    """Split a space-separated text with chunk_size=7 and chunk_overlap=3."""
    chunker = CharacterTextSplitter(separator=" ", chunk_size=7, chunk_overlap=3)
    chunks = chunker.split_text("foo bar baz 123")
    # Neighbouring chunks are expected to share one word.
    assert chunks == ["foo bar", "bar baz", "baz 123"]
|
|
|
|
|
|
|
|
|
2023-01-09 03:19:32 +00:00
|
|
|
def test_character_text_splitter_empty_doc() -> None:
    """Check that splitting by character count yields no empty documents."""
    chunker = CharacterTextSplitter(separator=" ", chunk_size=2, chunk_overlap=0)
    chunks = chunker.split_text("foo bar")
    assert chunks == ["foo", "bar"]
|
|
|
|
|
|
|
|
|
2023-03-07 23:42:28 +00:00
|
|
|
def test_character_text_splitter_separtor_empty_doc() -> None:
    """Check the edge case of single-character words around a separator."""
    chunker = CharacterTextSplitter(separator=" ", chunk_size=2, chunk_overlap=0)
    chunks = chunker.split_text("f b")
    assert chunks == ["f", "b"]
|
|
|
|
|
|
|
|
|
2022-12-19 01:21:43 +00:00
|
|
|
def test_character_text_splitter_long() -> None:
    """Split by character count when the long words come first."""
    chunker = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=1)
    chunks = chunker.split_text("foo bar baz a a")
    # The two trailing one-letter words are expected to share a chunk.
    assert chunks == ["foo", "bar", "baz", "a a"]
|
|
|
|
|
|
|
|
|
2023-01-08 23:11:10 +00:00
|
|
|
def test_character_text_splitter_short_words_first() -> None:
    """Split by character count when the short words come first."""
    chunker = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=1)
    chunks = chunker.split_text("a a foo bar baz")
    # The two leading one-letter words are expected to share a chunk.
    assert chunks == ["a a", "foo", "bar", "baz"]
|
|
|
|
|
|
|
|
|
2022-11-01 03:17:22 +00:00
|
|
|
def test_character_text_splitter_longer_words() -> None:
    """Split text whose words are all longer than the requested chunk size."""
    chunker = CharacterTextSplitter(separator=" ", chunk_size=1, chunk_overlap=1)
    chunks = chunker.split_text("foo bar baz 123")
    # Each word is expected to come back as its own chunk.
    assert chunks == ["foo", "bar", "baz", "123"]
|
|
|
|
|
|
|
|
|
|
|
|
def test_character_text_splitting_args() -> None:
    """Check that an overlap larger than the chunk size raises ValueError."""
    invalid_kwargs = {"chunk_size": 2, "chunk_overlap": 4}
    with pytest.raises(ValueError):
        CharacterTextSplitter(**invalid_kwargs)
|
2022-12-21 03:24:08 +00:00
|
|
|
|
|
|
|
|
2023-04-25 17:02:59 +00:00
|
|
|
def test_merge_splits() -> None:
    """Merge pre-split words back into chunks using a given separator."""
    chunker = CharacterTextSplitter(separator=" ", chunk_size=9, chunk_overlap=2)
    merged = chunker._merge_splits(["foo", "bar", "baz"], separator=" ")
    assert merged == ["foo bar", "baz"]
|
|
|
|
|
|
|
|
|
2022-12-21 03:24:08 +00:00
|
|
|
def test_create_documents() -> None:
    """Build documents from several texts with ``create_documents``."""
    chunker = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)
    documents = chunker.create_documents(["foo bar", "baz"])
    wanted = [Document(page_content=word) for word in ("foo", "bar", "baz")]
    assert documents == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_create_documents_with_metadata() -> None:
    """Build documents with ``create_documents`` and per-text metadata."""
    chunker = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)
    documents = chunker.create_documents(
        ["foo bar", "baz"], [{"source": "1"}, {"source": "2"}]
    )
    # Each chunk is expected to carry the metadata of the text it came from.
    content_and_source = (("foo", "1"), ("bar", "1"), ("baz", "2"))
    wanted = [
        Document(page_content=content, metadata={"source": source})
        for content, source in content_and_source
    ]
    assert documents == wanted
|
2023-01-08 23:11:10 +00:00
|
|
|
|
|
|
|
|
Add start index to metadata in TextSplitter (#5912)
<!--
Thank you for contributing to LangChain! Your PR will appear in our
release under the title you set. Please make sure it highlights your
valuable contribution.
Replace this with a description of the change, the issue it fixes (if
applicable), and relevant context. List any dependencies required for
this change.
After you're done, someone will review your PR. They may suggest
improvements. If no one reviews your PR within a few days, feel free to
@-mention the same people again, as notifications can get lost.
Finally, we'd love to show appreciation for your contribution - if you'd
like us to shout you out on Twitter, please also include your handle!
-->
#### Add start index to metadata in TextSplitter
- Modified method `create_documents` to track start position of each
chunk
- The `start_index` is included in the metadata if the `add_start_index`
parameter in the class constructor is set to `True`
This enables referencing back to the original document, particularly
useful when a specific chunk is retrieved.
<!-- If you're adding a new integration, please include:
1. a test for the integration - favor unit tests that do not rely on
network access.
2. an example notebook showing its use
See contribution guidelines for more information on how to write tests,
lint
etc:
https://github.com/hwchase17/langchain/blob/master/.github/CONTRIBUTING.md
-->
#### Who can review?
Tag maintainers/contributors who might be interested:
@eyurtsev @agola11
<!-- For a quicker response, figure out the right person to tag with @
@hwchase17 - project lead
Tracing / Callbacks
- @agola11
Async
- @agola11
DataLoaders
- @eyurtsev
Models
- @hwchase17
- @agola11
Agents / Tools / Toolkits
- @vowelparrot
VectorStores / Retrievers / Memory
- @dev2049
-->
2023-06-09 06:09:32 +00:00
|
|
|
def test_create_documents_with_start_index() -> None:
    """Build documents with ``add_start_index=True`` and check ``start_index``."""
    chunker = CharacterTextSplitter(
        separator=" ", chunk_size=7, chunk_overlap=3, add_start_index=True
    )
    documents = chunker.create_documents(["foo bar baz 123"])
    # Pairs of (chunk text, expected ``start_index`` metadata value).
    content_and_offset = (("foo bar", 0), ("bar baz", 4), ("baz 123", 8))
    wanted = [
        Document(page_content=content, metadata={"start_index": offset})
        for content, offset in content_and_offset
    ]
    assert documents == wanted
|
|
|
|
|
|
|
|
|
2023-03-11 17:18:25 +00:00
|
|
|
def test_metadata_not_shallow() -> None:
    """Check that documents built from one text do not share a metadata dict."""
    chunker = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)
    documents = chunker.create_documents(["foo bar"], [{"source": "1"}])
    assert documents == [
        Document(page_content="foo", metadata={"source": "1"}),
        Document(page_content="bar", metadata={"source": "1"}),
    ]
    first, second = documents
    # Mutating one document's metadata must leave the other one untouched.
    first.metadata["foo"] = 1
    assert first.metadata == {"source": "1", "foo": 1}
    assert second.metadata == {"source": "1"}
|
|
|
|
|
|
|
|
|
2023-06-10 23:48:53 +00:00
|
|
|
def test_iterative_text_splitter_keep_separator() -> None:
    """Run the shared splitter helper with ``keep_separator=True``."""
    chunks = __test_iterative_text_splitter(chunk_size=5, keep_separator=True)
    # Every chunk after the first is expected to start with its separator.
    wanted = ["....5", "X..3", "Y...4", "X....5", "Y..."]
    assert chunks == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_iterative_text_splitter_discard_separator() -> None:
    """Run the shared splitter helper with ``keep_separator=False``."""
    chunks = __test_iterative_text_splitter(chunk_size=5, keep_separator=False)
    # No separator characters are expected in the resulting chunks.
    wanted = ["....5", "..3", "...4", "....5", "..."]
    assert chunks == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def __test_iterative_text_splitter(chunk_size: int, keep_separator: bool) -> List[str]:
    """Split a fixed sample on "X"/"Y" and check that no chunk is oversized.

    Args:
        chunk_size: Base chunk size; increased by one when the separator is kept.
        keep_separator: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunks produced by the splitter.
    """
    if keep_separator:
        # Leave room for the separator character that stays on the chunk.
        chunk_size += 1

    chunker = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
        separators=["X", "Y"],
        keep_separator=keep_separator,
    )
    pieces = chunker.split_text("....5X..3Y...4X....5Y...")
    oversized = [piece for piece in pieces if len(piece) > chunk_size]
    assert not oversized, f"Chunk is larger than {chunk_size}"
    return pieces
|
|
|
|
|
|
|
|
|
2023-01-08 23:11:10 +00:00
|
|
|
def test_iterative_text_splitter() -> None:
    """Split a multi-paragraph sample with the default recursive separators."""
    sample = """Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.
This is a weird text to write, but gotta test the splittingggg some how.

Bye!\n\n-H."""
    chunker = RecursiveCharacterTextSplitter(chunk_size=10, chunk_overlap=1)
    wanted = [
        "Hi.",
        "I'm",
        "Harrison.",
        "How? Are?",
        "You?",
        "Okay then",
        "f f f f.",
        "This is a",
        "weird",
        "text to",
        "write,",
        "but gotta",
        "test the",
        "splitting",
        "gggg",
        "some how.",
        "Bye!",
        "-H.",
    ]
    assert chunker.split_text(sample) == wanted
|
2023-05-23 03:00:24 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_split_documents() -> None:
    """Split documents into single characters and check metadata is carried."""
    chunker = CharacterTextSplitter(separator="", chunk_size=1, chunk_overlap=0)
    samples = [("foo", "1"), ("bar", "2"), ("baz", "1")]
    inputs = [
        Document(page_content=content, metadata={"source": source})
        for content, source in samples
    ]
    # One expected document per character, tagged with its parent's source.
    wanted = []
    for content, source in samples:
        for char in content:
            wanted.append(Document(page_content=char, metadata={"source": source}))
    assert chunker.split_documents(inputs) == wanted
|
2023-05-29 23:56:31 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_python_text_splitter() -> None:
    """Split FAKE_PYTHON_TEXT with PythonCodeTextSplitter at chunk_size=30."""
    chunker = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)
    pieces = chunker.split_text(FAKE_PYTHON_TEXT)
    # NOTE(review): whitespace after the newlines in the first expected chunk
    # appears collapsed in the source view; confirm against the repository.
    wanted = [
        "class Foo:\n\n def bar():",
        "def foo():",
        "def testing_func():",
        "def bar():",
    ]
    assert pieces == wanted
|
Add more code splitters (go, rst, js, java, cpp, scala, ruby, php, swift, rust) (#5171)
As the title says, I added more code splitters.
The implementation is trivial, so i don't add separate tests for each
splitter.
Let me know if any concerns.
Fixes # (issue)
https://github.com/hwchase17/langchain/issues/5170
## Who can review?
Community members can review the PR once tests pass. Tag
maintainers/contributors who might be interested:
@eyurtsev @hwchase17
---------
Signed-off-by: byhsu <byhsu@linkedin.com>
Co-authored-by: byhsu <byhsu@linkedin.com>
2023-05-30 15:04:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Chunk size passed to ``from_language`` by the language-specific splitter
# tests that reference this constant.
CHUNK_SIZE = 16
|
|
|
|
|
|
|
|
|
|
|
|
def test_python_code_splitter() -> None:
    """Split a Python snippet with the ``Language.PYTHON`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.PYTHON, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal was lost in the
    # source view; confirm indentation against the repository.
    source = """
def hello_world():
print("Hello, World!")

# Call the function
hello_world()
"""
    wanted = [
        "def",
        "hello_world():",
        'print("Hello,',
        'World!")',
        "# Call the",
        "function",
        "hello_world()",
    ]
    assert chunker.split_text(source) == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_golang_code_splitter() -> None:
    """Split a Go snippet with the ``Language.GO`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.GO, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal was lost in the
    # source view; confirm indentation against the repository.
    source = """
package main

import "fmt"

func helloWorld() {
fmt.Println("Hello, World!")
}

func main() {
helloWorld()
}
"""
    wanted = [
        "package main",
        'import "fmt"',
        "func",
        "helloWorld() {",
        'fmt.Println("He',
        "llo,",
        'World!")',
        "}",
        "func main() {",
        "helloWorld()",
        "}",
    ]
    assert chunker.split_text(source) == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_rst_code_splitter() -> None:
    """Split a reStructuredText sample with the ``Language.RST`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.RST, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): any leading whitespace inside this literal was lost in the
    # source view; confirm against the repository.
    source = """
Sample Document
===============

Section
-------

This is the content of the section.

Lists
-----

- Item 1
- Item 2
- Item 3

Comment
*******
Not a comment

.. This is a comment
"""
    wanted = [
        "Sample Document",
        "===============",
        "Section",
        "-------",
        "This is the",
        "content of the",
        "section.",
        "Lists",
        "-----",
        "- Item 1",
        "- Item 2",
        "- Item 3",
        "Comment",
        "*******",
        "Not a comment",
        ".. This is a",
        "comment",
    ]
    assert chunker.split_text(source) == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_proto_file_splitter() -> None:
    """Split a protobuf definition with the ``Language.PROTO`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.PROTO, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal was lost in the
    # source view; confirm indentation against the repository.
    source = """
syntax = "proto3";

package example;

message Person {
string name = 1;
int32 age = 2;
repeated string hobbies = 3;
}
"""
    wanted = [
        "syntax =",
        '"proto3";',
        "package",
        "example;",
        "message Person",
        "{",
        "string name",
        "= 1;",
        "int32 age =",
        "2;",
        "repeated",
        "string hobbies",
        "= 3;",
        "}",
    ]
    assert chunker.split_text(source) == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_javascript_code_splitter() -> None:
    """Split a JavaScript snippet with the ``Language.JS`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.JS, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal was lost in the
    # source view; confirm indentation against the repository.
    source = """
function helloWorld() {
console.log("Hello, World!");
}

// Call the function
helloWorld();
"""
    wanted = [
        "function",
        "helloWorld() {",
        'console.log("He',
        "llo,",
        'World!");',
        "}",
        "// Call the",
        "function",
        "helloWorld();",
    ]
    assert chunker.split_text(source) == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_java_code_splitter() -> None:
    """Split a Java snippet with the ``Language.JAVA`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.JAVA, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal was lost in the
    # source view; confirm indentation against the repository.
    source = """
public class HelloWorld {
public static void main(String[] args) {
System.out.println("Hello, World!");
}
}
"""
    wanted = [
        "public class",
        "HelloWorld {",
        "public",
        "static void",
        "main(String[]",
        "args) {",
        "System.out.prin",
        'tln("Hello,',
        'World!");',
        "}\n}",
    ]
    assert chunker.split_text(source) == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_cpp_code_splitter() -> None:
    """Split a C++ snippet with the ``Language.CPP`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal was lost in the
    # source view; confirm indentation against the repository.
    source = """
#include <iostream>

int main() {
std::cout << "Hello, World!" << std::endl;
return 0;
}
"""
    wanted = [
        "#include",
        "<iostream>",
        "int main() {",
        "std::cout",
        '<< "Hello,',
        'World!" <<',
        "std::endl;",
        "return 0;\n}",
    ]
    assert chunker.split_text(source) == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_scala_code_splitter() -> None:
    """Split a Scala snippet with the ``Language.SCALA`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.SCALA, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal was lost in the
    # source view; confirm indentation against the repository.
    source = """
object HelloWorld {
def main(args: Array[String]): Unit = {
println("Hello, World!")
}
}
"""
    wanted = [
        "object",
        "HelloWorld {",
        "def",
        "main(args:",
        "Array[String]):",
        "Unit = {",
        'println("Hello,',
        'World!")',
        "}\n}",
    ]
    assert chunker.split_text(source) == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_ruby_code_splitter() -> None:
    """Split a Ruby snippet with the ``Language.RUBY`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.RUBY, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal was lost in the
    # source view; confirm indentation against the repository.
    source = """
def hello_world
puts "Hello, World!"
end

hello_world
"""
    wanted = [
        "def hello_world",
        'puts "Hello,',
        'World!"',
        "end",
        "hello_world",
    ]
    assert chunker.split_text(source) == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_php_code_splitter() -> None:
    """Split a PHP snippet with the ``Language.PHP`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.PHP, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal was lost in the
    # source view; confirm indentation against the repository.
    source = """
<?php
function hello_world() {
echo "Hello, World!";
}

hello_world();
?>
"""
    wanted = [
        "<?php",
        "function",
        "hello_world() {",
        "echo",
        '"Hello,',
        'World!";',
        "}",
        "hello_world();",
        "?>",
    ]
    assert chunker.split_text(source) == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_swift_code_splitter() -> None:
    """Split a Swift snippet with the ``Language.SWIFT`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.SWIFT, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal was lost in the
    # source view; confirm indentation against the repository.
    source = """
func helloWorld() {
print("Hello, World!")
}

helloWorld()
"""
    wanted = [
        "func",
        "helloWorld() {",
        'print("Hello,',
        'World!")',
        "}",
        "helloWorld()",
    ]
    assert chunker.split_text(source) == wanted
|
|
|
|
|
|
|
|
|
|
|
|
def test_rust_code_splitter() -> None:
    """Split a Rust snippet with the ``Language.RUST`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.RUST, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal was lost in the
    # source view; confirm indentation against the repository.
    source = """
fn main() {
println!("Hello, World!");
}
"""
    wanted = [
        "fn main() {",
        'println!("Hello',
        ",",
        'World!");',
        "}",
    ]
    assert chunker.split_text(source) == wanted
|
2023-06-05 23:40:26 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_markdown_code_splitter() -> None:
    """Split a Markdown sample with the ``Language.MARKDOWN`` splitter."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.MARKDOWN, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): any leading whitespace inside this literal was lost in the
    # source view; confirm against the repository.
    source = """
# Sample Document

## Section

This is the content of the section.

## Lists

- Item 1
- Item 2
- Item 3

### Horizontal lines

***********
____________
-------------------

#### Code blocks
```
This is a code block
```
"""
    wanted = [
        "# Sample",
        "Document",
        "## Section",
        "This is the",
        "content of the",
        "section.",
        "## Lists",
        "- Item 1",
        "- Item 2",
        "- Item 3",
        "### Horizontal",
        "lines",
        "***********",
        "____________",
        "---------------",
        "----",
        "#### Code",
        "blocks",
        "```",
        "This is a code",
        "block",
        "```",
    ]
    assert chunker.split_text(source) == wanted
|
2023-06-06 16:27:37 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_html_code_splitter() -> None:
    """Split an HTML sample with the ``Language.HTML`` splitter at size 60."""
    chunker = RecursiveCharacterTextSplitter.from_language(
        Language.HTML, chunk_size=60, chunk_overlap=0
    )
    # NOTE(review): leading whitespace inside this literal, and the whitespace
    # after each "\n" in the expected chunks, appears collapsed in the source
    # view; confirm both against the repository.
    source = """
<h1>Sample Document</h1>
<h2>Section</h2>
<p id="1234">Reference content.</p>

<h2>Lists</h2>
<ul>
<li>Item 1</li>
<li>Item 2</li>
<li>Item 3</li>
</ul>

<h3>A block</h3>
<div class="amazing">
<p>Some text</p>
<p>Some more text</p>
</div>
"""
    wanted = [
        "<h1>Sample Document</h1>\n <h2>Section</h2>",
        '<p id="1234">Reference content.</p>',
        "<h2>Lists</h2>\n <ul>",
        "<li>Item 1</li>\n <li>Item 2</li>",
        "<li>Item 3</li>\n </ul>",
        "<h3>A block</h3>",
        '<div class="amazing">',
        "<p>Some text</p>",
        "<p>Some more text</p>\n </div>",
    ]
    assert chunker.split_text(source) == wanted
|