mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Fix invalid escape sequence warnings (#8771)
<!-- Thank you for contributing to LangChain! Replace this comment with: - Description: a description of the change, - Issue: the issue # it fixes (if applicable), - Dependencies: any dependencies required for this change, - Tag maintainer: for a quicker response, tag the relevant maintainer (see below), - Twitter handle: we announce bigger features on Twitter. If your PR gets announced and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally. If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. Maintainer responsibilities: - General / Misc / if you don't know who to tag: @baskaryan - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev - Models / Prompts: @hwchase17, @baskaryan - Memory: @hwchase17 - Agents / Tools / Toolkits: @hinthornw - Tracing / Callbacks: @agola11 - Async: @agola11 If no one reviews your PR within a few days, feel free to @-mention the same people again. See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/hwchase17/langchain/blob/master/.github/CONTRIBUTING.md --> Description: The lines I have changed look incorrectly escaped for regex. In Python 3.11, I receive a DeprecationWarning for these lines. You don't see any warnings unless you explicitly run Python with the `-W always::DeprecationWarning` flag. So, this is my attempt to fix it. 
Here are the warnings from log files: ``` /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:919: DeprecationWarning: invalid escape sequence '\s' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:918: DeprecationWarning: invalid escape sequence '\s' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:917: DeprecationWarning: invalid escape sequence '\s' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:916: DeprecationWarning: invalid escape sequence '\c' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:903: DeprecationWarning: invalid escape sequence '\*' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:804: DeprecationWarning: invalid escape sequence '\*' /usr/local/lib/python3.11/site-packages/langchain/text_splitter.py:804: DeprecationWarning: invalid escape sequence '\*' ``` cc @baskaryan --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
0adc282d70
commit
454998c1fb
@ -832,7 +832,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
# Split along section titles
|
||||
"\n=+\n",
|
||||
"\n-+\n",
|
||||
"\n\*+\n",
|
||||
"\n\\*+\n",
|
||||
# Split along directive markers
|
||||
"\n\n.. *\n\n",
|
||||
# Split by the normal type of lines
|
||||
@ -931,7 +931,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
# End of code block
|
||||
"```\n",
|
||||
# Horizontal lines
|
||||
"\n\*\*\*+\n",
|
||||
"\n\\*\\*\\*+\n",
|
||||
"\n---+\n",
|
||||
"\n___+\n",
|
||||
# Note that this splitter doesn't handle horizontal lines defined
|
||||
@ -944,19 +944,19 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
elif language == Language.LATEX:
|
||||
return [
|
||||
# First, try to split along Latex sections
|
||||
"\n\\\chapter{",
|
||||
"\n\\\section{",
|
||||
"\n\\\subsection{",
|
||||
"\n\\\subsubsection{",
|
||||
"\n\\\\chapter{",
|
||||
"\n\\\\section{",
|
||||
"\n\\\\subsection{",
|
||||
"\n\\\\subsubsection{",
|
||||
# Now split by environments
|
||||
"\n\\\begin{enumerate}",
|
||||
"\n\\\begin{itemize}",
|
||||
"\n\\\begin{description}",
|
||||
"\n\\\begin{list}",
|
||||
"\n\\\begin{quote}",
|
||||
"\n\\\begin{quotation}",
|
||||
"\n\\\begin{verse}",
|
||||
"\n\\\begin{verbatim}",
|
||||
"\n\\\\begin{enumerate}",
|
||||
"\n\\\\begin{itemize}",
|
||||
"\n\\\\begin{description}",
|
||||
"\n\\\\begin{list}",
|
||||
"\n\\\\begin{quote}",
|
||||
"\n\\\\begin{quotation}",
|
||||
"\n\\\\begin{verse}",
|
||||
"\n\\\\begin{verbatim}",
|
||||
# Now split by math environments
|
||||
"\n\\\begin{align}",
|
||||
"$$",
|
||||
|
@ -406,6 +406,10 @@ Not a comment
|
||||
".. This is a",
|
||||
"comment",
|
||||
]
|
||||
# Special test for special characters
|
||||
code = "harry\n***\nbabylon is"
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == ["harry", "***\nbabylon is"]
|
||||
|
||||
|
||||
def test_proto_file_splitter() -> None:
|
||||
@ -680,6 +684,22 @@ This is a code block
|
||||
"block",
|
||||
"```",
|
||||
]
|
||||
# Special test for special characters
|
||||
code = "harry\n***\nbabylon is"
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == ["harry", "***\nbabylon is"]
|
||||
|
||||
|
||||
def test_latex_code_splitter() -> None:
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
Language.LATEX, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
Hi Harrison!
|
||||
\\chapter{1}
|
||||
"""
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == ["Hi Harrison!", "\\chapter{1}"]
|
||||
|
||||
|
||||
def test_html_code_splitter() -> None:
|
||||
|
Loading…
Reference in New Issue
Block a user