text-splitters[patch]: fix MarkdownHeaderTextSplitter failing to parse headers with non-printable characters (#20645)

Description: `MarkdownHeaderTextSplitter` fails to parse headers with non-printable characters. See #20643 for more details.

The following is the official test case. Just replacing `# Foo\n\n` with `\ufeff# Foo\n\n` will cause the test case to fail: the chunk metadata is empty.

```python
def test_md_header_text_splitter_1() -> None:
    """Test markdown splitter by header: Case 1."""
    markdown_document = (
        "\ufeff# Foo\n\n"
        " ## Bar\n\n"
        "Hi this is Jim\n\n"
        "Hi this is Joe\n\n"
        " ## Baz\n\n"
        " Hi this is Molly"
    )
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
    )
    output = markdown_splitter.split_text(markdown_document)
    expected_output = [
        Document(
            page_content="Hi this is Jim \nHi this is Joe",
            metadata={"Header 1": "Foo", "Header 2": "Bar"},
        ),
        Document(
            page_content="Hi this is Molly",
            metadata={"Header 1": "Foo", "Header 2": "Baz"},
        ),
    ]
    assert output == expected_output
```

twitter: @coolbeevip

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
3 weeks ago · 2cd907ad7e
parent 2968f20970
commit 2cd907ad7e
2 changed files with 35 additions and 1 deletion
--- a/libs/text-splitters/langchain_text_splitters/markdown.py
+++ b/libs/text-splitters/langchain_text_splitters/markdown.py
@@ -107,7 +107,9 @@ class MarkdownHeaderTextSplitter:

        for line in lines:
            stripped_line = line.strip()
-
+            # Remove all non-printable characters from the string, keeping only visible
+            # text.
+            stripped_line = "".join(filter(str.isprintable, stripped_line))
            if not in_code_block:
                # Exclude inline code spans
                if stripped_line.startswith("```") and stripped_line.count("```") == 1:
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@@ -1220,6 +1220,38 @@ def test_md_header_text_splitter_fenced_code_block_interleaved(
    assert output == expected_output


+@pytest.mark.parametrize("characters", ["\ufeff"])
+def test_md_header_text_splitter_with_invisible_characters(characters: str) -> None:
+    """Test markdown splitter by header: Invisible characters before headers."""
+
+    markdown_document = (
+        f"{characters}# Foo\n\n" "foo()\n" f"{characters}## Bar\n\n" "bar()"
+    )
+
+    headers_to_split_on = [
+        ("#", "Header 1"),
+        ("##", "Header 2"),
+    ]
+
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+
+    expected_output = [
+        Document(
+            page_content="foo()",
+            metadata={"Header 1": "Foo"},
+        ),
+        Document(
+            page_content="bar()",
+            metadata={"Header 1": "Foo", "Header 2": "Bar"},
+        ),
+    ]
+
+    assert output == expected_output
+
+
 def test_solidity_code_splitter() -> None:
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0