From 454998c1fb8abcba01014f0693af37463cc7a938 Mon Sep 17 00:00:00 2001
From: Emre <aemr3@users.noreply.github.com>
Date: Mon, 7 Aug 2023 03:01:18 +0300
Subject: [PATCH] Fix invalid escape sequence warnings (#8771)

<!-- Thank you for contributing to LangChain!

Replace this comment with:
  - Description: a description of the change,
  - Issue: the issue # it fixes (if applicable),
  - Dependencies: any dependencies required for this change,
- Tag maintainer: for a quicker response, tag the relevant maintainer
(see below),
- Twitter handle: we announce bigger features on Twitter. If your PR
gets announced and you'd like a mention, we'll gladly shout you out!

Please make sure you're PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` to check this
locally.

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
  2. an example notebook showing its use.

Maintainer responsibilities:
  - General / Misc / if you don't know who to tag: @baskaryan
  - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev
  - Models / Prompts: @hwchase17, @baskaryan
  - Memory: @hwchase17
  - Agents / Tools / Toolkits: @hinthornw
  - Tracing / Callbacks: @agola11
  - Async: @agola11

If no one reviews your PR within a few days, feel free to @-mention the
same people again.

See contribution guidelines for more information on how to write/run
tests, lint, etc:
https://github.com/hwchase17/langchain/blob/master/.github/CONTRIBUTING.md
 -->

Description: The lines I have changed looks like incorrectly escaped for
regex. In python 3.11, I receive DeprecationWarning for these lines.
You don't see any warnings unless you explicitly run python with `-W
always::DeprecationWarning` flag. So, this is my attempt to fix it.

Here are the warnings from log files:

```
/usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:919: DeprecationWarning: invalid escape sequence '\s'
/usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:918: DeprecationWarning: invalid escape sequence '\s'
/usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:917: DeprecationWarning: invalid escape sequence '\s'
/usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:916: DeprecationWarning: invalid escape sequence '\c'
/usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:903: DeprecationWarning: invalid escape sequence '\*'
/usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:804: DeprecationWarning: invalid escape sequence '\*'
/usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:804: DeprecationWarning: invalid escape sequence '\*'
```

cc @baskaryan

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
---
 libs/langchain/langchain/text_splitter.py     | 28 +++++++++----------
 .../tests/unit_tests/test_text_splitter.py    | 20 +++++++++++++
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py
index bd0c560411..638f0c830d 100644
--- a/libs/langchain/langchain/text_splitter.py
+++ b/libs/langchain/langchain/text_splitter.py
@@ -832,7 +832,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
                 # Split along section titles
                 "\n=+\n",
                 "\n-+\n",
-                "\n\*+\n",
+                "\n\\*+\n",
                 # Split along directive markers
                 "\n\n.. *\n\n",
                 # Split by the normal type of lines
@@ -931,7 +931,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
                 # End of code block
                 "```\n",
                 # Horizontal lines
-                "\n\*\*\*+\n",
+                "\n\\*\\*\\*+\n",
                 "\n---+\n",
                 "\n___+\n",
                 # Note that this splitter doesn't handle horizontal lines defined
@@ -944,19 +944,19 @@ class RecursiveCharacterTextSplitter(TextSplitter):
         elif language == Language.LATEX:
             return [
                 # First, try to split along Latex sections
-                "\n\\\chapter{",
-                "\n\\\section{",
-                "\n\\\subsection{",
-                "\n\\\subsubsection{",
+                "\n\\\\chapter{",
+                "\n\\\\section{",
+                "\n\\\\subsection{",
+                "\n\\\\subsubsection{",
                 # Now split by environments
-                "\n\\\begin{enumerate}",
-                "\n\\\begin{itemize}",
-                "\n\\\begin{description}",
-                "\n\\\begin{list}",
-                "\n\\\begin{quote}",
-                "\n\\\begin{quotation}",
-                "\n\\\begin{verse}",
-                "\n\\\begin{verbatim}",
+                "\n\\\\begin{enumerate}",
+                "\n\\\\begin{itemize}",
+                "\n\\\\begin{description}",
+                "\n\\\\begin{list}",
+                "\n\\\\begin{quote}",
+                "\n\\\\begin{quotation}",
+                "\n\\\\begin{verse}",
+                "\n\\\\begin{verbatim}",
                 # Now split by math environments
                 "\n\\\begin{align}",
                 "$$",
diff --git a/libs/langchain/tests/unit_tests/test_text_splitter.py b/libs/langchain/tests/unit_tests/test_text_splitter.py
index 59a34d63c7..39d07b0152 100644
--- a/libs/langchain/tests/unit_tests/test_text_splitter.py
+++ b/libs/langchain/tests/unit_tests/test_text_splitter.py
@@ -406,6 +406,10 @@ Not a comment
         ".. This is a",
         "comment",
     ]
+    # Special test for special characters
+    code = "harry\n***\nbabylon is"
+    chunks = splitter.split_text(code)
+    assert chunks == ["harry", "***\nbabylon is"]
 
 
 def test_proto_file_splitter() -> None:
@@ -680,6 +684,22 @@ This is a code block
         "block",
         "```",
     ]
+    # Special test for special characters
+    code = "harry\n***\nbabylon is"
+    chunks = splitter.split_text(code)
+    assert chunks == ["harry", "***\nbabylon is"]
+
+
+def test_latex_code_splitter() -> None:
+    splitter = RecursiveCharacterTextSplitter.from_language(
+        Language.LATEX, chunk_size=CHUNK_SIZE, chunk_overlap=0
+    )
+    code = """
+Hi Harrison!
+\\chapter{1}
+"""
+    chunks = splitter.split_text(code)
+    assert chunks == ["Hi Harrison!", "\\chapter{1}"]
 
 
 def test_html_code_splitter() -> None: