Fix invalid escape sequence warnings (#8771)

Description: The lines I have changed look like they were incorrectly escaped for regex. In Python 3.11, I receive a DeprecationWarning for each of these lines. You don't see any warnings unless you explicitly run Python with the `-W always::DeprecationWarning` flag. So, this is my attempt to fix it. Here are the warnings from the log files: ``` /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:919: DeprecationWarning: invalid escape sequence '\s' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:918: DeprecationWarning: invalid escape sequence '\s' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:917: DeprecationWarning: invalid escape sequence '\s' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:916: DeprecationWarning: invalid escape sequence '\c' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:903: DeprecationWarning: invalid escape sequence '\*' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:804: DeprecationWarning: invalid escape sequence '\*' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:804: DeprecationWarning: invalid escape sequence '\*' ``` cc @baskaryan --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
1 year ago · 454998c1fb
parent 0adc282d70
commit 454998c1fb
2 changed files with 34 additions and 14 deletions
--- a/libs/langchain/langchain/text_splitter.py
+++ b/libs/langchain/langchain/text_splitter.py
@@ -832,7 +832,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
                # Split along section titles
                "\n=+\n",
                "\n-+\n",
-                "\n\*+\n",
+                "\n\\*+\n",
                # Split along directive markers
                "\n\n.. *\n\n",
                # Split by the normal type of lines
@@ -931,7 +931,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
                # End of code block
                "```\n",
                # Horizontal lines
-                "\n\*\*\*+\n",
+                "\n\\*\\*\\*+\n",
                "\n---+\n",
                "\n___+\n",
                # Note that this splitter doesn't handle horizontal lines defined
@@ -944,19 +944,19 @@ class RecursiveCharacterTextSplitter(TextSplitter):
        elif language == Language.LATEX:
            return [
                # First, try to split along Latex sections
-                "\n\\\chapter{",
-                "\n\\\section{",
-                "\n\\\subsection{",
-                "\n\\\subsubsection{",
+                "\n\\\\chapter{",
+                "\n\\\\section{",
+                "\n\\\\subsection{",
+                "\n\\\\subsubsection{",
                # Now split by environments
-                "\n\\\begin{enumerate}",
-                "\n\\\begin{itemize}",
-                "\n\\\begin{description}",
-                "\n\\\begin{list}",
-                "\n\\\begin{quote}",
-                "\n\\\begin{quotation}",
-                "\n\\\begin{verse}",
-                "\n\\\begin{verbatim}",
+                "\n\\\\begin{enumerate}",
+                "\n\\\\begin{itemize}",
+                "\n\\\\begin{description}",
+                "\n\\\\begin{list}",
+                "\n\\\\begin{quote}",
+                "\n\\\\begin{quotation}",
+                "\n\\\\begin{verse}",
+                "\n\\\\begin{verbatim}",
                # Now split by math environments
                "\n\\\begin{align}",
                "$$",
--- a/libs/langchain/tests/unit_tests/test_text_splitter.py
+++ b/libs/langchain/tests/unit_tests/test_text_splitter.py
@@ -406,6 +406,10 @@ Not a comment
        ".. This is a",
        "comment",
    ]
+    # Special test for special characters
+    code = "harry\n***\nbabylon is"
+    chunks = splitter.split_text(code)
+    assert chunks == ["harry", "***\nbabylon is"]


 def test_proto_file_splitter() -> None:
@@ -680,6 +684,22 @@ This is a code block
        "block",
        "```",
    ]
+    # Special test for special characters
+    code = "harry\n***\nbabylon is"
+    chunks = splitter.split_text(code)
+    assert chunks == ["harry", "***\nbabylon is"]
+
+
+def test_latex_code_splitter() -> None:
+    splitter = RecursiveCharacterTextSplitter.from_language(
+        Language.LATEX, chunk_size=CHUNK_SIZE, chunk_overlap=0
+    )
+    code = """
+Hi Harrison!
+\\chapter{1}
+"""
+    chunks = splitter.split_text(code)
+    assert chunks == ["Hi Harrison!", "\\chapter{1}"]


 def test_html_code_splitter() -> None: