mirror of
https://github.com/hwchase17/langchain
synced 2024-11-18 09:25:54 +00:00
Add more code splitters (go, rst, js, java, cpp, scala, ruby, php, swift, rust) (#5171)
As the title says, I added more code splitters. The implementation is trivial, so i don't add separate tests for each splitter. Let me know if any concerns. Fixes # (issue) https://github.com/hwchase17/langchain/issues/5170 ## Who can review? Community members can review the PR once tests pass. Tag maintainers/contributors who might be interested: @eyurtsev @hwchase17 --------- Signed-off-by: byhsu <byhsu@linkedin.com> Co-authored-by: byhsu <byhsu@linkedin.com>
This commit is contained in:
parent
a61b7f7e7c
commit
9d658aaa5a
158
docs/modules/indexes/text_splitters/examples/code_splitter.ipynb
Normal file
158
docs/modules/indexes/text_splitters/examples/code_splitter.ipynb
Normal file
@ -0,0 +1,158 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# CodeTextSplitter\n",
|
||||
"\n",
|
||||
"CodeTextSplitter allows you to split your code with multiple language support. Import enum `Language` and specify the language. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.text_splitter import (\n",
|
||||
" CodeTextSplitter,\n",
|
||||
" Language,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Choose a language to use"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"python_splitter = CodeTextSplitter(\n",
|
||||
" language=Language.PYTHON, chunk_size=16, chunk_overlap=0\n",
|
||||
")\n",
|
||||
"js_splitter = CodeTextSplitter(\n",
|
||||
" language=Language.JS, chunk_size=16, chunk_overlap=0\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Split the code"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='def', metadata={}),\n",
|
||||
" Document(page_content='hello_world():', metadata={}),\n",
|
||||
" Document(page_content='print(\"Hello,', metadata={}),\n",
|
||||
" Document(page_content='World!\")', metadata={}),\n",
|
||||
" Document(page_content='# Call the', metadata={}),\n",
|
||||
" Document(page_content='function', metadata={}),\n",
|
||||
" Document(page_content='hello_world()', metadata={})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"PYTHON_CODE = \"\"\"\n",
|
||||
"def hello_world():\n",
|
||||
" print(\"Hello, World!\")\n",
|
||||
"\n",
|
||||
"# Call the function\n",
|
||||
"hello_world()\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"python_docs = python_splitter.create_documents([PYTHON_CODE])\n",
|
||||
"python_docs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='function', metadata={}),\n",
|
||||
" Document(page_content='helloWorld() {', metadata={}),\n",
|
||||
" Document(page_content='console.log(\"He', metadata={}),\n",
|
||||
" Document(page_content='llo,', metadata={}),\n",
|
||||
" Document(page_content='World!\");', metadata={}),\n",
|
||||
" Document(page_content='}', metadata={}),\n",
|
||||
" Document(page_content='// Call the', metadata={}),\n",
|
||||
" Document(page_content='function', metadata={}),\n",
|
||||
" Document(page_content='helloWorld();', metadata={})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"JS_CODE = \"\"\"\n",
|
||||
"function helloWorld() {\n",
|
||||
" console.log(\"Hello, World!\");\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"// Call the function\n",
|
||||
"helloWorld();\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"js_docs = js_splitter.create_documents([JS_CODE])\n",
|
||||
"js_docs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "langchain",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.12"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -5,6 +5,7 @@ import copy
|
||||
import logging
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from enum import Enum
|
||||
from typing import (
|
||||
AbstractSet,
|
||||
Any,
|
||||
@ -475,3 +476,314 @@ class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
|
||||
"",
|
||||
]
|
||||
super().__init__(separators=separators, **kwargs)
|
||||
|
||||
|
||||
class Language(str, Enum):
|
||||
CPP = "cpp"
|
||||
GO = "go"
|
||||
JAVA = "java"
|
||||
JS = "js"
|
||||
PHP = "php"
|
||||
PROTO = "proto"
|
||||
PYTHON = "python"
|
||||
RST = "rst"
|
||||
RUBY = "ruby"
|
||||
RUST = "rust"
|
||||
SCALA = "scala"
|
||||
SWIFT = "swift"
|
||||
MARKDOWN = "markdown"
|
||||
LATEX = "latex"
|
||||
|
||||
|
||||
class CodeTextSplitter(RecursiveCharacterTextSplitter):
|
||||
def __init__(self, language: Language, **kwargs: Any):
|
||||
"""
|
||||
A generic code text splitter supporting many programming languages.
|
||||
Example:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.JAVA
|
||||
)
|
||||
Args:
|
||||
Language: The programming language to use
|
||||
"""
|
||||
separators = self._get_separators_for_language(language)
|
||||
super().__init__(separators=separators, **kwargs)
|
||||
|
||||
def _get_separators_for_language(self, language: Language) -> List[str]:
|
||||
if language == Language.CPP:
|
||||
return [
|
||||
# Split along class definitions
|
||||
"\nclass ",
|
||||
# Split along function definitions
|
||||
"\nvoid ",
|
||||
"\nint ",
|
||||
"\nfloat ",
|
||||
"\ndouble ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nfor ",
|
||||
"\nwhile ",
|
||||
"\nswitch ",
|
||||
"\ncase ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.GO:
|
||||
return [
|
||||
# Split along function definitions
|
||||
"\nfunc ",
|
||||
"\nvar ",
|
||||
"\nconst ",
|
||||
"\ntype ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nfor ",
|
||||
"\nswitch ",
|
||||
"\ncase ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.JAVA:
|
||||
return [
|
||||
# Split along class definitions
|
||||
"\nclass ",
|
||||
# Split along method definitions
|
||||
"\npublic ",
|
||||
"\nprotected ",
|
||||
"\nprivate ",
|
||||
"\nstatic ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nfor ",
|
||||
"\nwhile ",
|
||||
"\nswitch ",
|
||||
"\ncase ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.JS:
|
||||
return [
|
||||
# Split along function definitions
|
||||
"\nfunction ",
|
||||
"\nconst ",
|
||||
"\nlet ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nfor ",
|
||||
"\nwhile ",
|
||||
"\nswitch ",
|
||||
"\ncase ",
|
||||
"\ndefault ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.PHP:
|
||||
return [
|
||||
# Split along function definitions
|
||||
"\nfunction ",
|
||||
# Split along class definitions
|
||||
"\nclass ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nforeach ",
|
||||
"\nwhile ",
|
||||
"\ndo ",
|
||||
"\nswitch ",
|
||||
"\ncase ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.PROTO:
|
||||
return [
|
||||
# Split along message definitions
|
||||
"\nmessage ",
|
||||
# Split along service definitions
|
||||
"\nservice ",
|
||||
# Split along enum definitions
|
||||
"\nenum ",
|
||||
# Split along option definitions
|
||||
"\noption ",
|
||||
# Split along import statements
|
||||
"\nimport ",
|
||||
# Split along syntax declarations
|
||||
"\nsyntax ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.PYTHON:
|
||||
return [
|
||||
# First, try to split along class definitions
|
||||
"\nclass ",
|
||||
"\ndef ",
|
||||
"\n\tdef ",
|
||||
# Now split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.RST:
|
||||
return [
|
||||
# Split along section titles
|
||||
"\n===\n",
|
||||
"\n---\n",
|
||||
"\n***\n",
|
||||
# Split along directive markers
|
||||
"\n.. ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.RUBY:
|
||||
return [
|
||||
# Split along method definitions
|
||||
"\ndef ",
|
||||
"\nclass ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nunless ",
|
||||
"\nwhile ",
|
||||
"\nfor ",
|
||||
"\ndo ",
|
||||
"\nbegin ",
|
||||
"\nrescue ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.RUST:
|
||||
return [
|
||||
# Split along function definitions
|
||||
"\nfn ",
|
||||
"\nconst ",
|
||||
"\nlet ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nwhile ",
|
||||
"\nfor ",
|
||||
"\nloop ",
|
||||
"\nmatch ",
|
||||
"\nconst ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.SCALA:
|
||||
return [
|
||||
# Split along class definitions
|
||||
"\nclass ",
|
||||
"\nobject ",
|
||||
# Split along method definitions
|
||||
"\ndef ",
|
||||
"\nval ",
|
||||
"\nvar ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nfor ",
|
||||
"\nwhile ",
|
||||
"\nmatch ",
|
||||
"\ncase ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.SWIFT:
|
||||
return [
|
||||
# Split along function definitions
|
||||
"\nfunc ",
|
||||
# Split along class definitions
|
||||
"\nclass ",
|
||||
"\nstruct ",
|
||||
"\nenum ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nfor ",
|
||||
"\nwhile ",
|
||||
"\ndo ",
|
||||
"\nswitch ",
|
||||
"\ncase ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.MARKDOWN:
|
||||
return [
|
||||
# First, try to split along Markdown headings (starting with level 2)
|
||||
"\n## ",
|
||||
"\n### ",
|
||||
"\n#### ",
|
||||
"\n##### ",
|
||||
"\n###### ",
|
||||
# Note the alternative syntax for headings (below) is not handled here
|
||||
# Heading level 2
|
||||
# ---------------
|
||||
# End of code block
|
||||
"```\n\n",
|
||||
# Horizontal lines
|
||||
"\n\n***\n\n",
|
||||
"\n\n---\n\n",
|
||||
"\n\n___\n\n",
|
||||
# Note that this splitter doesn't handle horizontal lines defined
|
||||
# by *three or more* of ***, ---, or ___, but this is not handled
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.LATEX:
|
||||
return [
|
||||
# First, try to split along Latex sections
|
||||
"\n\\chapter{",
|
||||
"\n\\section{",
|
||||
"\n\\subsection{",
|
||||
"\n\\subsubsection{",
|
||||
# Now split by environments
|
||||
"\n\\begin{enumerate}",
|
||||
"\n\\begin{itemize}",
|
||||
"\n\\begin{description}",
|
||||
"\n\\begin{list}",
|
||||
"\n\\begin{quote}",
|
||||
"\n\\begin{quotation}",
|
||||
"\n\\begin{verse}",
|
||||
"\n\\begin{verbatim}",
|
||||
## Now split by math environments
|
||||
"\n\\begin{align}",
|
||||
"$$",
|
||||
"$",
|
||||
# Now split by the normal type of lines
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Language {language} is not supported! "
|
||||
f"Please choose from {list(Language)}"
|
||||
)
|
||||
|
@ -4,6 +4,8 @@ import pytest
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.text_splitter import (
|
||||
CharacterTextSplitter,
|
||||
CodeTextSplitter,
|
||||
Language,
|
||||
PythonCodeTextSplitter,
|
||||
RecursiveCharacterTextSplitter,
|
||||
)
|
||||
@ -194,3 +196,317 @@ def test_python_text_splitter() -> None:
|
||||
split_3 = """def bar():"""
|
||||
expected_splits = [split_0, split_1, split_2, split_3]
|
||||
assert splits == expected_splits
|
||||
|
||||
|
||||
CHUNK_SIZE = 16
|
||||
|
||||
|
||||
def test_python_code_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.PYTHON, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
def hello_world():
|
||||
print("Hello, World!")
|
||||
|
||||
# Call the function
|
||||
hello_world()
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"def",
|
||||
"hello_world():",
|
||||
'print("Hello,',
|
||||
'World!")',
|
||||
"# Call the",
|
||||
"function",
|
||||
"hello_world()",
|
||||
]
|
||||
|
||||
|
||||
def test_golang_code_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.GO, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
package main
|
||||
|
||||
import "fmt"
|
||||
|
||||
func helloWorld() {
|
||||
fmt.Println("Hello, World!")
|
||||
}
|
||||
|
||||
func main() {
|
||||
helloWorld()
|
||||
}
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"package main",
|
||||
'import "fmt"',
|
||||
"func",
|
||||
"helloWorld() {",
|
||||
'fmt.Println("He',
|
||||
"llo,",
|
||||
'World!")',
|
||||
"}",
|
||||
"func main() {",
|
||||
"helloWorld()",
|
||||
"}",
|
||||
]
|
||||
|
||||
|
||||
def test_rst_code_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.RST, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
Sample Document
|
||||
===============
|
||||
|
||||
Section
|
||||
-------
|
||||
|
||||
This is the content of the section.
|
||||
|
||||
Lists
|
||||
-----
|
||||
|
||||
- Item 1
|
||||
- Item 2
|
||||
- Item 3
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"Sample Document",
|
||||
"===============",
|
||||
"Section",
|
||||
"-------",
|
||||
"This is the",
|
||||
"content of the",
|
||||
"section.",
|
||||
"Lists\n-----",
|
||||
"- Item 1",
|
||||
"- Item 2",
|
||||
"- Item 3",
|
||||
]
|
||||
|
||||
|
||||
def test_proto_file_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.PROTO, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
syntax = "proto3";
|
||||
|
||||
package example;
|
||||
|
||||
message Person {
|
||||
string name = 1;
|
||||
int32 age = 2;
|
||||
repeated string hobbies = 3;
|
||||
}
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"syntax =",
|
||||
'"proto3";',
|
||||
"package",
|
||||
"example;",
|
||||
"message Person",
|
||||
"{",
|
||||
"string name",
|
||||
"= 1;",
|
||||
"int32 age =",
|
||||
"2;",
|
||||
"repeated",
|
||||
"string hobbies",
|
||||
"= 3;",
|
||||
"}",
|
||||
]
|
||||
|
||||
|
||||
def test_javascript_code_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.JS, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
function helloWorld() {
|
||||
console.log("Hello, World!");
|
||||
}
|
||||
|
||||
// Call the function
|
||||
helloWorld();
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"function",
|
||||
"helloWorld() {",
|
||||
'console.log("He',
|
||||
"llo,",
|
||||
'World!");',
|
||||
"}",
|
||||
"// Call the",
|
||||
"function",
|
||||
"helloWorld();",
|
||||
]
|
||||
|
||||
|
||||
def test_java_code_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.JAVA, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
public class HelloWorld {
|
||||
public static void main(String[] args) {
|
||||
System.out.println("Hello, World!");
|
||||
}
|
||||
}
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"public class",
|
||||
"HelloWorld {",
|
||||
"public",
|
||||
"static void",
|
||||
"main(String[]",
|
||||
"args) {",
|
||||
"System.out.prin",
|
||||
'tln("Hello,',
|
||||
'World!");',
|
||||
"}\n}",
|
||||
]
|
||||
|
||||
|
||||
def test_cpp_code_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
#include <iostream>
|
||||
|
||||
int main() {
|
||||
std::cout << "Hello, World!" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"#include",
|
||||
"<iostream>",
|
||||
"int main() {",
|
||||
"std::cout",
|
||||
'<< "Hello,',
|
||||
'World!" <<',
|
||||
"std::endl;",
|
||||
"return 0;\n}",
|
||||
]
|
||||
|
||||
|
||||
def test_scala_code_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.SCALA, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
object HelloWorld {
|
||||
def main(args: Array[String]): Unit = {
|
||||
println("Hello, World!")
|
||||
}
|
||||
}
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"object",
|
||||
"HelloWorld {",
|
||||
"def",
|
||||
"main(args:",
|
||||
"Array[String]):",
|
||||
"Unit = {",
|
||||
'println("Hello,',
|
||||
'World!")',
|
||||
"}\n}",
|
||||
]
|
||||
|
||||
|
||||
def test_ruby_code_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.RUBY, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
def hello_world
|
||||
puts "Hello, World!"
|
||||
end
|
||||
|
||||
hello_world
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"def hello_world",
|
||||
'puts "Hello,',
|
||||
'World!"',
|
||||
"end",
|
||||
"hello_world",
|
||||
]
|
||||
|
||||
|
||||
def test_php_code_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.PHP, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
<?php
|
||||
function hello_world() {
|
||||
echo "Hello, World!";
|
||||
}
|
||||
|
||||
hello_world();
|
||||
?>
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"<?php",
|
||||
"function",
|
||||
"hello_world() {",
|
||||
"echo",
|
||||
'"Hello,',
|
||||
'World!";',
|
||||
"}",
|
||||
"hello_world();",
|
||||
"?>",
|
||||
]
|
||||
|
||||
|
||||
def test_swift_code_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.SWIFT, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
func helloWorld() {
|
||||
print("Hello, World!")
|
||||
}
|
||||
|
||||
helloWorld()
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"func",
|
||||
"helloWorld() {",
|
||||
'print("Hello,',
|
||||
'World!")',
|
||||
"}",
|
||||
"helloWorld()",
|
||||
]
|
||||
|
||||
|
||||
def test_rust_code_splitter() -> None:
|
||||
splitter = CodeTextSplitter(
|
||||
language=Language.RUST, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
fn main() {
|
||||
println!("Hello, World!");
|
||||
}
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == ["fn main() {", 'println!("Hello', ",", 'World!");', "}"]
|
||||
|
Loading…
Reference in New Issue
Block a user