From 454998c1fb8abcba01014f0693af37463cc7a938 Mon Sep 17 00:00:00 2001 From: Emre Date: Mon, 7 Aug 2023 03:01:18 +0300 Subject: [PATCH] Fix invalid escape sequence warnings (#8771) Description: The lines I have changed look like they were incorrectly escaped for regex. In Python 3.11, I receive a DeprecationWarning for each of these lines. You don't see any warnings unless you explicitly run Python with the `-W always::DeprecationWarning` flag. So, this is my attempt to fix it. Here are the warnings from log files: ``` /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:919: DeprecationWarning: invalid escape sequence '\s' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:918: DeprecationWarning: invalid escape sequence '\s' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:917: DeprecationWarning: invalid escape sequence '\s' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:916: DeprecationWarning: invalid escape sequence '\c' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:903: DeprecationWarning: invalid escape sequence '\*' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:804: DeprecationWarning: invalid escape sequence '\*' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:804: DeprecationWarning: invalid escape sequence '\*' ``` cc @baskaryan --------- Co-authored-by: Harrison Chase --- libs/langchain/langchain/text_splitter.py | 28 +++++++++---------- .../tests/unit_tests/test_text_splitter.py | 20 +++++++++++++ 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py index bd0c560411..638f0c830d 100644 --- a/libs/langchain/langchain/text_splitter.py +++ b/libs/langchain/langchain/text_splitter.py @@ -832,7 +832,7 @@ class RecursiveCharacterTextSplitter(TextSplitter): # Split along section titles "\n=+\n", "\n-+\n", - "\n\*+\n", + "\n\\*+\n", # Split along directive
markers "\n\n.. *\n\n", # Split by the normal type of lines @@ -931,7 +931,7 @@ class RecursiveCharacterTextSplitter(TextSplitter): # End of code block "```\n", # Horizontal lines - "\n\*\*\*+\n", + "\n\\*\\*\\*+\n", "\n---+\n", "\n___+\n", # Note that this splitter doesn't handle horizontal lines defined @@ -944,19 +944,19 @@ class RecursiveCharacterTextSplitter(TextSplitter): elif language == Language.LATEX: return [ # First, try to split along Latex sections - "\n\\\chapter{", - "\n\\\section{", - "\n\\\subsection{", - "\n\\\subsubsection{", + "\n\\\\chapter{", + "\n\\\\section{", + "\n\\\\subsection{", + "\n\\\\subsubsection{", # Now split by environments - "\n\\\begin{enumerate}", - "\n\\\begin{itemize}", - "\n\\\begin{description}", - "\n\\\begin{list}", - "\n\\\begin{quote}", - "\n\\\begin{quotation}", - "\n\\\begin{verse}", - "\n\\\begin{verbatim}", + "\n\\\\begin{enumerate}", + "\n\\\\begin{itemize}", + "\n\\\\begin{description}", + "\n\\\\begin{list}", + "\n\\\\begin{quote}", + "\n\\\\begin{quotation}", + "\n\\\\begin{verse}", + "\n\\\\begin{verbatim}", # Now split by math environments "\n\\\begin{align}", "$$", diff --git a/libs/langchain/tests/unit_tests/test_text_splitter.py b/libs/langchain/tests/unit_tests/test_text_splitter.py index 59a34d63c7..39d07b0152 100644 --- a/libs/langchain/tests/unit_tests/test_text_splitter.py +++ b/libs/langchain/tests/unit_tests/test_text_splitter.py @@ -406,6 +406,10 @@ Not a comment ".. 
This is a", "comment", ] + # Special test for special characters + code = "harry\n***\nbabylon is" + chunks = splitter.split_text(code) + assert chunks == ["harry", "***\nbabylon is"] def test_proto_file_splitter() -> None: @@ -680,6 +684,22 @@ This is a code block "block", "```", ] + # Special test for special characters + code = "harry\n***\nbabylon is" + chunks = splitter.split_text(code) + assert chunks == ["harry", "***\nbabylon is"] + + +def test_latex_code_splitter() -> None: + splitter = RecursiveCharacterTextSplitter.from_language( + Language.LATEX, chunk_size=CHUNK_SIZE, chunk_overlap=0 + ) + code = """ +Hi Harrison! +\\chapter{1} +""" + chunks = splitter.split_text(code) + assert chunks == ["Hi Harrison!", "\\chapter{1}"] def test_html_code_splitter() -> None: