Escape backslashes correctly in the RST, Markdown and LaTeX separator lists
of RecursiveCharacterTextSplitter, and add regression tests.

Why each change is needed (all literals are ordinary, non-raw Python strings):
  * "\*" and "\c", "\s" are invalid escape sequences; Python keeps the
    backslash but emits a deprecation warning. Writing "\\*" / "\\\\chapter"
    produces the same intended pattern text without the warning.
  * "\\\b" is parsed as "\\" followed by "\b", i.e. a backslash and a
    BACKSPACE character, so every "\n\\\begin{...}" separator could never
    match "\begin{...}" in a document. This includes "\n\\\begin{align}",
    which is now fixed together with the other environments.

NOTE(review): leading whitespace of the context lines was reconstructed from
the code's nesting; confirm it against the target tree before applying.
NOTE(review): the post-image blob id on the first "index" line was not
recomputed after adding the align change.
NOTE(review): the "$$" separator is left untouched; if separators are
compiled as regular expressions it matches end-of-string rather than a
literal "$$" - verify separately.

diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py
index bd0c560411..638f0c830d 100644
--- a/libs/langchain/langchain/text_splitter.py
+++ b/libs/langchain/langchain/text_splitter.py
@@ -832,7 +832,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
                 # Split along section titles
                 "\n=+\n",
                 "\n-+\n",
-                "\n\*+\n",
+                "\n\\*+\n",
                 # Split along directive markers
                 "\n\n.. *\n\n",
                 # Split by the normal type of lines
@@ -931,7 +931,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
                 # End of code block
                 "```\n",
                 # Horizontal lines
-                "\n\*\*\*+\n",
+                "\n\\*\\*\\*+\n",
                 "\n---+\n",
                 "\n___+\n",
                 # Note that this splitter doesn't handle horizontal lines defined
@@ -944,19 +944,19 @@ class RecursiveCharacterTextSplitter(TextSplitter):
         elif language == Language.LATEX:
             return [
                 # First, try to split along Latex sections
-                "\n\\\chapter{",
-                "\n\\\section{",
-                "\n\\\subsection{",
-                "\n\\\subsubsection{",
+                "\n\\\\chapter{",
+                "\n\\\\section{",
+                "\n\\\\subsection{",
+                "\n\\\\subsubsection{",
                 # Now split by environments
-                "\n\\\begin{enumerate}",
-                "\n\\\begin{itemize}",
-                "\n\\\begin{description}",
-                "\n\\\begin{list}",
-                "\n\\\begin{quote}",
-                "\n\\\begin{quotation}",
-                "\n\\\begin{verse}",
-                "\n\\\begin{verbatim}",
+                "\n\\\\begin{enumerate}",
+                "\n\\\\begin{itemize}",
+                "\n\\\\begin{description}",
+                "\n\\\\begin{list}",
+                "\n\\\\begin{quote}",
+                "\n\\\\begin{quotation}",
+                "\n\\\\begin{verse}",
+                "\n\\\\begin{verbatim}",
                 # Now split by math environments
-                "\n\\\begin{align}",
+                "\n\\\\begin{align}",
                 "$$",
diff --git a/libs/langchain/tests/unit_tests/test_text_splitter.py b/libs/langchain/tests/unit_tests/test_text_splitter.py
index 59a34d63c7..39d07b0152 100644
--- a/libs/langchain/tests/unit_tests/test_text_splitter.py
+++ b/libs/langchain/tests/unit_tests/test_text_splitter.py
@@ -406,6 +406,10 @@ Not a comment
         ".. This is a",
         "comment",
     ]
+    # Special test for special characters
+    code = "harry\n***\nbabylon is"
+    chunks = splitter.split_text(code)
+    assert chunks == ["harry", "***\nbabylon is"]
 
 
 def test_proto_file_splitter() -> None:
@@ -680,6 +684,22 @@ This is a code block
         "block",
         "```",
     ]
+    # Special test for special characters
+    code = "harry\n***\nbabylon is"
+    chunks = splitter.split_text(code)
+    assert chunks == ["harry", "***\nbabylon is"]
+
+
+def test_latex_code_splitter() -> None:
+    splitter = RecursiveCharacterTextSplitter.from_language(
+        Language.LATEX, chunk_size=CHUNK_SIZE, chunk_overlap=0
+    )
+    code = """
+Hi Harrison!
+\\chapter{1}
+"""
+    chunks = splitter.split_text(code)
+    assert chunks == ["Hi Harrison!", "\\chapter{1}"]
 
 
 def test_html_code_splitter() -> None: