Fix bug in MarkdownHeaderTextSplitter for codeblock (#10262)

- Description: The previous version of MarkdownHeaderTextSplitter did not account for '#' appearing inside fenced code blocks, so such lines were treated as Markdown headers and the text was split incorrectly. This PR fixes the issue by tracking whether the current line is inside a code block and, if so, appending it to the current content without header matching. - Issue: - Dependencies: No - Tag maintainer: - Twitter handle: cc @baskaryan @eyurtsev @rlancemartin --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
10 months ago · f1269830a0
parent 656d2303f7
commit f1269830a0
2 changed files with 20 additions and 0 deletions
--- a/libs/langchain/langchain/text_splitter.py
+++ b/libs/langchain/langchain/text_splitter.py
@ -390,8 +390,22 @@ class MarkdownHeaderTextSplitter:
        header_stack: List[HeaderType] = []
        initial_metadata: Dict[str, str] = {}

+        in_code_block = False
+
        for line in lines:
            stripped_line = line.strip()
+
+            if stripped_line.startswith("```"):
+                # code block in one row
+                if stripped_line.count("```") >= 2:
+                    in_code_block = False
+                else:
+                    in_code_block = not in_code_block
+
+            if in_code_block:
+                current_content.append(stripped_line)
+                continue
+
            # Check each line against each of the header types (e.g., #, ##)
            for sep, name in self.headers_to_split_on:
                # Check if line starts with a header that we intend to split on
--- a/libs/langchain/tests/unit_tests/test_text_splitter.py
+++ b/libs/langchain/tests/unit_tests/test_text_splitter.py
@ -783,6 +783,10 @@ ____________
 #### Code blocks
 ```
 This is a code block
+
+# sample code
+a = 1
+b = 2
 ```
    """
    chunks = splitter.split_text(code)
@ -808,6 +812,8 @@ This is a code block
        "```",
        "This is a code",
        "block",
+        "# sample code",
+        "a = 1\nb = 2",
        "```",
    ]
    # Special test for special characters