chore: add kotlin code splitter (#11364)

<!-- Thank you for contributing to LangChain!

Replace this entire comment with:
  - **Description:** a description of the change, 
  - **Issue:** the issue # it fixes (if applicable),
  - **Dependencies:** any dependencies required for this change,
- **Tag maintainer:** for a quicker response, tag the relevant
maintainer (see below),
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` to check this
locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc:

https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in `docs/extras`
directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->

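- **Description:** Adds Kotlin (`Language.KOTLIN`) as a supported language for `RecursiveCharacterTextSplitter.from_language`, with a Kotlin-specific separator list and a unit test.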

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
pull/11278/head^2
Fynn Flügge 9 months ago committed by GitHub
parent b93a08079e
commit 0a4baca291
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -17,6 +17,7 @@ from langchain.text_splitter import (
['cpp',
'go',
'java',
'kotlin',
'js',
'ts',
'php',

@ -614,6 +614,7 @@ class Language(str, Enum):
CPP = "cpp"
GO = "go"
JAVA = "java"
KOTLIN = "kotlin"
JS = "js"
TS = "ts"
PHP = "php"
@ -762,6 +763,32 @@ class RecursiveCharacterTextSplitter(TextSplitter):
" ",
"",
]
elif language == Language.KOTLIN:
return [
# Split along class definitions
"\nclass ",
# Split along method definitions
"\npublic ",
"\nprotected ",
"\nprivate ",
"\ninternal ",
"\ncompanion ",
"\nfun ",
"\nval ",
"\nvar ",
# Split along control flow statements
"\nif ",
"\nfor ",
"\nwhile ",
"\nwhen ",
"\ncase ",
"\nelse ",
# Split by the normal type of lines
"\n\n",
"\n",
" ",
"",
]
elif language == Language.JS:
return [
# Split along function definitions

@ -525,6 +525,38 @@ public class HelloWorld {
]
# Checks that a splitter built for Language.KOTLIN breaks a small Kotlin
# snippet into the exact list of chunks asserted below.
def test_kotlin_code_splitter() -> None:
# Build the splitter through the language-aware factory. Chunks do not
# overlap (chunk_overlap=0); CHUNK_SIZE is defined outside this view.
splitter = RecursiveCharacterTextSplitter.from_language(
Language.KOTLIN, chunk_size=CHUNK_SIZE, chunk_overlap=0
)
# Fixture: a Kotlin class whose companion object holds a @JvmStatic main.
code = """
class HelloWorld {
companion object {
@JvmStatic
fun main(args: Array<String>) {
println("Hello, World!")
}
}
}
"""
chunks = splitter.split_text(code)
# The comparison is exact: both chunk contents and chunk order must match.
# NOTE(review): the expected "}\n }" entry has whitespace before the second
# brace, so the Kotlin fixture above is presumably indented in the real file
# and that indentation was lost in this rendering -- confirm against the repo.
assert chunks == [
"class",
"HelloWorld {",
"companion",
"object {",
"@JvmStatic",
"fun",
"main(args:",
"Array<String>)",
"{",
'println("Hello,',
'World!")',
"}\n }",
"}",
]
def test_csharp_code_splitter() -> None:
splitter = RecursiveCharacterTextSplitter.from_language(
Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0

Loading…
Cancel
Save