From f1269830a0fb0baa692f8738396b30413ed6b75d Mon Sep 17 00:00:00 2001 From: Haris Wang Date: Fri, 6 Oct 2023 09:34:42 +0800 Subject: [PATCH] Fix bug in MarkdownHeaderTextSplitter for codeblock (#10262) - Description: The previous version of the MarkdownHeaderTextSplitter did not take into account the possibility of '#' appearing within code blocks, which caused segmentation anomalies in these situations. This PR has fixed this issue. - Issue: - Dependencies: No - Tag maintainer: - Twitter handle: cc @baskaryan @eyurtsev @rlancemartin --------- Co-authored-by: Bagatur --- libs/langchain/langchain/text_splitter.py | 14 ++++++++++++++ .../tests/unit_tests/test_text_splitter.py | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py index a9cc3b5dfe..c7ad4a2c37 100644 --- a/libs/langchain/langchain/text_splitter.py +++ b/libs/langchain/langchain/text_splitter.py @@ -390,8 +390,22 @@ class MarkdownHeaderTextSplitter: header_stack: List[HeaderType] = [] initial_metadata: Dict[str, str] = {} + in_code_block = False + for line in lines: stripped_line = line.strip() + + if stripped_line.startswith("```"): + # code block in one row + if stripped_line.count("```") >= 2: + in_code_block = False + else: + in_code_block = not in_code_block + + if in_code_block: + current_content.append(stripped_line) + continue + # Check each line against each of the header types (e.g., #, ##) for sep, name in self.headers_to_split_on: # Check if line starts with a header that we intend to split on diff --git a/libs/langchain/tests/unit_tests/test_text_splitter.py b/libs/langchain/tests/unit_tests/test_text_splitter.py index ccddee434a..578286cedc 100644 --- a/libs/langchain/tests/unit_tests/test_text_splitter.py +++ b/libs/langchain/tests/unit_tests/test_text_splitter.py @@ -783,6 +783,10 @@ ____________ #### Code blocks ``` This is a code block + +# sample code +a = 1 +b = 2 ``` """ chunks = splitter.split_text(code) @@ -808,6 +812,8 @@ This is a code block "```", "This is a code", "block", + "# sample code", + "a = 1\nb = 2", "```", ] # Special test for special characters