mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Feature/adding csharp support to textsplitter (#10350)
**Description:** Adding C# language support for `RecursiveCharacterTextSplitter`.
**Issue:** N/A
**Dependencies:** N/A

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
3e5a143625
commit
4258c23867
@ -627,6 +627,7 @@ class Language(str, Enum):
|
|||||||
LATEX = "latex"
|
LATEX = "latex"
|
||||||
HTML = "html"
|
HTML = "html"
|
||||||
SOL = "sol"
|
SOL = "sol"
|
||||||
|
CSHARP = "csharp"
|
||||||
|
|
||||||
|
|
||||||
class RecursiveCharacterTextSplitter(TextSplitter):
|
class RecursiveCharacterTextSplitter(TextSplitter):
|
||||||
@ -1002,6 +1003,43 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|||||||
"<title",
|
"<title",
|
||||||
"",
|
"",
|
||||||
]
|
]
|
||||||
|
elif language == Language.CSHARP:
|
||||||
|
return [
|
||||||
|
"\ninterface ",
|
||||||
|
"\nenum ",
|
||||||
|
"\nimplements ",
|
||||||
|
"\ndelegate ",
|
||||||
|
"\nevent ",
|
||||||
|
# Split along class definitions
|
||||||
|
"\nclass ",
|
||||||
|
"\nabstract ",
|
||||||
|
# Split along method definitions
|
||||||
|
"\npublic ",
|
||||||
|
"\nprotected ",
|
||||||
|
"\nprivate ",
|
||||||
|
"\nstatic ",
|
||||||
|
"\nreturn ",
|
||||||
|
# Split along control flow statements
|
||||||
|
"\nif ",
|
||||||
|
"\ncontinue ",
|
||||||
|
"\nfor ",
|
||||||
|
"\nforeach ",
|
||||||
|
"\nwhile ",
|
||||||
|
"\nswitch ",
|
||||||
|
"\nbreak ",
|
||||||
|
"\ncase ",
|
||||||
|
"\nelse ",
|
||||||
|
# Split by exceptions
|
||||||
|
"\ntry ",
|
||||||
|
"\nthrow ",
|
||||||
|
"\nfinally ",
|
||||||
|
"\ncatch ",
|
||||||
|
# Split by the normal type of lines
|
||||||
|
"\n\n",
|
||||||
|
"\n",
|
||||||
|
" ",
|
||||||
|
"",
|
||||||
|
]
|
||||||
elif language == Language.SOL:
|
elif language == Language.SOL:
|
||||||
return [
|
return [
|
||||||
# Split along compiler information definitions
|
# Split along compiler information definitions
|
||||||
@ -1032,6 +1070,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|||||||
" ",
|
" ",
|
||||||
"",
|
"",
|
||||||
]
|
]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Language {language} is not supported! "
|
f"Language {language} is not supported! "
|
||||||
|
@ -498,6 +498,73 @@ public class HelloWorld {
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_csharp_code_splitter() -> None:
|
||||||
|
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||||
|
Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||||
|
)
|
||||||
|
code = """
|
||||||
|
using System;
|
||||||
|
class Program
|
||||||
|
{
|
||||||
|
static void Main()
|
||||||
|
{
|
||||||
|
int age = 30; // Change the age value as needed
|
||||||
|
|
||||||
|
// Categorize the age without any console output
|
||||||
|
if (age < 18)
|
||||||
|
{
|
||||||
|
// Age is under 18
|
||||||
|
}
|
||||||
|
else if (age >= 18 && age < 65)
|
||||||
|
{
|
||||||
|
// Age is an adult
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Age is a senior citizen
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
chunks = splitter.split_text(code)
|
||||||
|
assert chunks == [
|
||||||
|
"using System;",
|
||||||
|
"class Program\n{",
|
||||||
|
"static void",
|
||||||
|
"Main()",
|
||||||
|
"{",
|
||||||
|
"int age",
|
||||||
|
"= 30; // Change",
|
||||||
|
"the age value",
|
||||||
|
"as needed",
|
||||||
|
"//",
|
||||||
|
"Categorize the",
|
||||||
|
"age without any",
|
||||||
|
"console output",
|
||||||
|
"if (age",
|
||||||
|
"< 18)",
|
||||||
|
"{",
|
||||||
|
"//",
|
||||||
|
"Age is under 18",
|
||||||
|
"}",
|
||||||
|
"else if",
|
||||||
|
"(age >= 18 &&",
|
||||||
|
"age < 65)",
|
||||||
|
"{",
|
||||||
|
"//",
|
||||||
|
"Age is an adult",
|
||||||
|
"}",
|
||||||
|
"else",
|
||||||
|
"{",
|
||||||
|
"//",
|
||||||
|
"Age is a senior",
|
||||||
|
"citizen",
|
||||||
|
"}\n }",
|
||||||
|
"}",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_cpp_code_splitter() -> None:
|
def test_cpp_code_splitter() -> None:
|
||||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||||
Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||||
|
Loading…
Reference in New Issue
Block a user