Feature/adding csharp support to textsplitter (#10350)

**Description:** Adding C# language support for
`RecursiveCharacterTextSplitter`
**Issue:**   N/A
**Dependencies:** N/A

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Syed Ather Rizvi 2023-09-08 19:01:06 -04:00 committed by GitHub
parent 3e5a143625
commit 4258c23867
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 106 additions and 0 deletions

View File

@ -627,6 +627,7 @@ class Language(str, Enum):
LATEX = "latex" LATEX = "latex"
HTML = "html" HTML = "html"
SOL = "sol" SOL = "sol"
CSHARP = "csharp"
class RecursiveCharacterTextSplitter(TextSplitter): class RecursiveCharacterTextSplitter(TextSplitter):
@ -1002,6 +1003,43 @@ class RecursiveCharacterTextSplitter(TextSplitter):
"<title", "<title",
"", "",
] ]
elif language == Language.CSHARP:
return [
"\ninterface ",
"\nenum ",
"\nimplements ",
"\ndelegate ",
"\nevent ",
# Split along class definitions
"\nclass ",
"\nabstract ",
# Split along method definitions
"\npublic ",
"\nprotected ",
"\nprivate ",
"\nstatic ",
"\nreturn ",
# Split along control flow statements
"\nif ",
"\ncontinue ",
"\nfor ",
"\nforeach ",
"\nwhile ",
"\nswitch ",
"\nbreak ",
"\ncase ",
"\nelse ",
# Split by exceptions
"\ntry ",
"\nthrow ",
"\nfinally ",
"\ncatch ",
# Fall back to paragraph, line, word, and character boundaries
"\n\n",
"\n",
" ",
"",
]
elif language == Language.SOL: elif language == Language.SOL:
return [ return [
# Split along compiler information definitions # Split along compiler information definitions
@ -1032,6 +1070,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
" ", " ",
"", "",
] ]
else: else:
raise ValueError( raise ValueError(
f"Language {language} is not supported! " f"Language {language} is not supported! "

View File

@ -498,6 +498,73 @@ public class HelloWorld {
] ]
def test_csharp_code_splitter() -> None:
    """Check how a small C# program is chunked by the C#-aware splitter.

    The splitter is built with ``from_language(Language.CSHARP, ...)`` using
    ``CHUNK_SIZE`` and no overlap, and the resulting chunks are compared
    against a fixed expected list.
    """
    csharp_splitter = RecursiveCharacterTextSplitter.from_language(
        Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # The sample is a runtime string: its content (including line breaks)
    # determines the chunks, so it is kept exactly as written.
    sample = """
using System;
class Program
{
static void Main()
{
int age = 30; // Change the age value as needed
// Categorize the age without any console output
if (age < 18)
{
// Age is under 18
}
else if (age >= 18 && age < 65)
{
// Age is an adult
}
else
{
// Age is a senior citizen
}
}
}
"""
    # NOTE(review): the "}\n }" entry below contains a space before the
    # second brace, while no line of the sample above starts with
    # whitespace -- presumably the sample was indented in the repository
    # and this view dropped it; confirm against the original file.
    expected_chunks = [
        "using System;",
        "class Program\n{",
        "static void",
        "Main()",
        "{",
        "int age",
        "= 30; // Change",
        "the age value",
        "as needed",
        "//",
        "Categorize the",
        "age without any",
        "console output",
        "if (age",
        "< 18)",
        "{",
        "//",
        "Age is under 18",
        "}",
        "else if",
        "(age >= 18 &&",
        "age < 65)",
        "{",
        "//",
        "Age is an adult",
        "}",
        "else",
        "{",
        "//",
        "Age is a senior",
        "citizen",
        "}\n }",
        "}",
    ]
    assert csharp_splitter.split_text(sample) == expected_chunks
def test_cpp_code_splitter() -> None: def test_cpp_code_splitter() -> None:
splitter = RecursiveCharacterTextSplitter.from_language( splitter = RecursiveCharacterTextSplitter.from_language(
Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0 Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0