text-splitters: Introduce Experimental Markdown Syntax Splitter (#22257)

#### Description This MR defines an `ExperimentalMarkdownSyntaxTextSplitter` class. The main goal is to replicate the functionality of the original `MarkdownHeaderTextSplitter`, which extracts the header stack as metadata, but with one critical difference: it keeps the whitespace of the original text intact. This draft reimplements the `MarkdownHeaderTextSplitter` with a very different algorithmic approach. Instead of marking up each line of the text individually and aggregating them back together into chunks, this method builds each chunk sequentially and applies the metadata to each chunk. This makes the implementation simpler. However, since it's designed to keep whitespace intact, it's not a full drop-in replacement for the original. Since it is a radical implementation change to the original code, I would like to get feedback on whether this is a worthwhile replacement, should be its own class, or is not a good idea at all. Note: I implemented the `return_each_line` parameter but I don't think it's a necessary feature. I'd prefer to remove it. This implementation also adds the following additional features: - Splits out code blocks and includes the language in the `"Code"` metadata key - Splits text on the horizontal rule `---` as well - The `headers_to_split_on` parameter is now optional - with sensible defaults that can be overridden. #### Issue Keeping the whitespace keeps the paragraph structure and the formatting of the code blocks intact, which allows the caller much more flexibility in how they want to further split the individual sections of the resulting documents. This addresses the issues brought up by the community in the following issues: - https://github.com/langchain-ai/langchain/issues/20823 - https://github.com/langchain-ai/langchain/issues/19436 - https://github.com/langchain-ai/langchain/issues/22256 #### Dependencies N/A #### Twitter handle @RyanElston --------- Co-authored-by: isaac hershenson <ihershenson@hmc.edu>
4 months ago · 86ee4f0daa
parent 93d0ad97fe
commit 86ee4f0daa
2 changed files with 368 additions and 2 deletions
--- a/libs/text-splitters/langchain_text_splitters/markdown.py
+++ b/libs/text-splitters/langchain_text_splitters/markdown.py
@ -1,6 +1,7 @@
 from __future__ import annotations

-from typing import Any, Dict, List, Tuple, TypedDict
+import re
+from typing import Any, Dict, List, Tuple, TypedDict, Union

 from langchain_core.documents import Document

@ -221,3 +222,161 @@ class HeaderType(TypedDict):
    level: int
    name: str
    data: str
+
+
+class ExperimentalMarkdownSyntaxTextSplitter:
+    """
+    An experimental text splitter for handling Markdown syntax.
+
+    This splitter aims to retain the exact whitespace of the original text while
+    extracting structured metadata, such as headers. It is a re-implementation of the
+    MarkdownHeaderTextSplitter with notable changes to the approach and
+    additional features.
+
+    Key Features:
+    - Retains the original whitespace and formatting of the Markdown text.
+    - Attaches the stack of enclosing headers to every chunk as metadata.
+    - Splits out fenced code blocks and includes the language in the "Code"
+      metadata key.
+    - Splits text on horizontal rules (`---`, `***`, `___`) as well.
+    - Defaults to sensible splitting behavior, which can be overridden using the
+      `headers_to_split_on` parameter.
+
+    Parameters:
+    ----------
+    headers_to_split_on : List[Tuple[str, str]], optional
+        ``(marker, metadata_key)`` pairs such as ``("#", "Header 1")``. Defaults to
+        the six Markdown header levels, keyed "Header 1" through "Header 6".
+    return_each_line : bool, optional
+        When set to True, returns each non-blank line as a separate chunk (without
+        its line ending). Default is False.
+    strip_headers : bool, optional
+        When True (the default), header lines are recorded as metadata only and are
+        left out of the chunk content. When False, a header line is kept as the
+        first line of the chunk it introduces.
+
+    Usage example:
+    --------------
+    >>> headers_to_split_on = [
+    >>>     ("#", "Header 1"),
+    >>>     ("##", "Header 2"),
+    >>> ]
+    >>> splitter = ExperimentalMarkdownSyntaxTextSplitter(
+    >>>     headers_to_split_on=headers_to_split_on
+    >>> )
+    >>> chunks = splitter.split_text(text)
+    >>> for chunk in chunks:
+    >>>     print(chunk)
+
+    This class is currently experimental and subject to change based on feedback and
+    further development.
+    """
+
+    DEFAULT_HEADER_KEYS = {
+        "#": "Header 1",
+        "##": "Header 2",
+        "###": "Header 3",
+        "####": "Header 4",
+        "#####": "Header 5",
+        "######": "Header 6",
+    }
+
+    # Patterns are compiled once instead of on every line of every document.
+    _HEADER_PATTERN = re.compile(r"^(#{1,6}) (.*)")
+    _CODE_PATTERN = re.compile(r"^(?:```|~~~)(.*)")
+    # Trailing whitespace is optional so that a rule on the last line of the text
+    # (no newline) or one ending in "\r\n" is still recognised.
+    _HORZ_PATTERN = re.compile(r"^(?:\*{3,}|-{3,}|_{3,})\s*$")
+
+    def __init__(
+        self,
+        headers_to_split_on: Union[List[Tuple[str, str]], None] = None,
+        return_each_line: bool = False,
+        strip_headers: bool = True,
+    ):
+        self.chunks: List[Document] = []
+        self.current_chunk = Document(page_content="")
+        self.current_header_stack: List[Tuple[int, str]] = []
+        self.strip_headers = strip_headers
+        if headers_to_split_on:
+            self.splittable_headers = dict(headers_to_split_on)
+        else:
+            # Copy so that changes to an instance never leak into the class default.
+            self.splittable_headers = dict(self.DEFAULT_HEADER_KEYS)
+
+        self.return_each_line = return_each_line
+
+    def split_text(self, text: str) -> List[Document]:
+        """Split Markdown ``text`` into documents tagged with their header stack.
+
+        A new chunk is started at every configured header, fenced code block and
+        horizontal rule. Chunks that are empty or whitespace-only are discarded.
+
+        Args:
+            text: The Markdown source to split.
+
+        Returns:
+            The chunks in document order, or one document per non-blank line when
+            ``return_each_line`` is set.
+        """
+        # Start from a clean slate: without this, a reused splitter would return
+        # the chunks of earlier calls and apply their headers to the new text.
+        self.chunks = []
+        self.current_chunk = Document(page_content="")
+        self.current_header_stack = []
+
+        raw_lines = text.splitlines(keepends=True)
+
+        while raw_lines:
+            raw_line = raw_lines.pop(0)
+            header_match = self._match_header(raw_line)
+            code_match = self._match_code(raw_line)
+            horz_match = self._match_horz(raw_line)
+            if header_match:
+                self._complete_chunk_doc()
+
+                if not self.strip_headers:
+                    self.current_chunk.page_content += raw_line
+
+                # add the header to the stack
+                header_depth = len(header_match.group(1))
+                # "." also matches "\r", so strip to keep CRLF line endings (and
+                # trailing blanks) out of the metadata value.
+                header_text = header_match.group(2).strip()
+                self._resolve_header_stack(header_depth, header_text)
+            elif code_match:
+                self._complete_chunk_doc()
+                self.current_chunk.page_content = self._resolve_code_chunk(
+                    raw_line, raw_lines
+                )
+                self.current_chunk.metadata["Code"] = code_match.group(1).strip()
+                self._complete_chunk_doc()
+            elif horz_match:
+                # The rule itself is dropped; it only separates two chunks.
+                self._complete_chunk_doc()
+            else:
+                self.current_chunk.page_content += raw_line
+
+        self._complete_chunk_doc()
+        # I don't see why `return_each_line` is a necessary feature of this splitter.
+        # It's easy enough to do outside of the class and the caller can have more
+        # control over it.
+        if self.return_each_line:
+            return [
+                # Each line gets its own metadata dict so that editing one document
+                # cannot affect its siblings.
+                Document(page_content=line, metadata=dict(chunk.metadata))
+                for chunk in self.chunks
+                for line in chunk.page_content.splitlines()
+                if line and not line.isspace()
+            ]
+        return self.chunks
+
+    def _resolve_header_stack(self, header_depth: int, header_text: str) -> None:
+        """Push a header, first dropping every header at the same or a deeper level.
+
+        Truncating on ``depth >= header_depth`` (rather than only on an exact match)
+        keeps the stack ordered when levels are skipped, e.g. ``#``, ``###``, ``##``
+        must not leave the stale ``###`` entry behind.
+        """
+        for i, (depth, _) in enumerate(self.current_header_stack):
+            if depth >= header_depth:
+                self.current_header_stack = self.current_header_stack[:i]
+                break
+        self.current_header_stack.append((header_depth, header_text))
+
+    def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:
+        """Consume a fenced code block from ``raw_lines`` and return its full text.
+
+        ``current_line`` is the opening fence. Lines are popped from the front of
+        ``raw_lines`` up to and including the closing fence, which must use the same
+        fence characters as the opening one. If the fence is never closed, the block
+        runs to the end of the text instead of being thrown away.
+        """
+        fence = current_line[:3]
+        block_lines = [current_line]
+        while raw_lines:
+            raw_line = raw_lines.pop(0)
+            block_lines.append(raw_line)
+            if raw_line.startswith(fence):
+                break
+        return "".join(block_lines)
+
+    def _complete_chunk_doc(self) -> None:
+        """Store the current chunk (unless it is blank) and start a fresh one."""
+        chunk_content = self.current_chunk.page_content
+        # Discard any empty documents
+        if chunk_content and not chunk_content.isspace():
+            # Apply the header stack as metadata
+            for depth, value in self.current_header_stack:
+                header_key = self.splittable_headers.get("#" * depth)
+                self.current_chunk.metadata[header_key] = value
+            self.chunks.append(self.current_chunk)
+        # Reset the current chunk
+        self.current_chunk = Document(page_content="")
+
+    # Match methods
+    def _match_header(self, line: str) -> Union[re.Match, None]:
+        """Return the match for a configured ATX header line, else None.
+
+        Group 1 is the run of ``#`` characters, group 2 the rest of the line.
+        """
+        match = self._HEADER_PATTERN.match(line)
+        # Only matches on the configured headers
+        if match and match.group(1) in self.splittable_headers:
+            return match
+        return None
+
+    def _match_code(self, line: str) -> Union[re.Match, None]:
+        """Return the match for a code fence line (three backticks or tildes).
+
+        Group 1 is whatever follows the fence, i.e. the language on an opening fence.
+        """
+        return self._CODE_PATTERN.match(line)
+
+    def _match_horz(self, line: str) -> Union[re.Match, None]:
+        """Return the match for a horizontal rule (`***`, `---` or `___`), else None.
+
+        The rule must be the only thing on the line apart from trailing whitespace.
+        """
+        return self._HORZ_PATTERN.match(line)
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
@ -19,7 +19,10 @@ from langchain_text_splitters.base import split_text_on_tokens
 from langchain_text_splitters.character import CharacterTextSplitter
 from langchain_text_splitters.html import HTMLHeaderTextSplitter, HTMLSectionSplitter
 from langchain_text_splitters.json import RecursiveJsonSplitter
-from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
+from langchain_text_splitters.markdown import (
+    ExperimentalMarkdownSyntaxTextSplitter,
+    MarkdownHeaderTextSplitter,
+)
 from langchain_text_splitters.python import PythonCodeTextSplitter

 FAKE_PYTHON_TEXT = """
@ -1296,6 +1299,210 @@ def test_md_header_text_splitter_with_invisible_characters(characters: str) -> N
    assert output == expected_output


+# Shared fixture for the experimental Markdown splitter tests: two header levels,
+# a fenced code block, a horizontal rule and a final paragraph with no trailing
+# newline. The leading backslash keeps the literal from starting with a newline.
+EXPERIMENTAL_MARKDOWN_DOCUMENT = """\
+# My Header 1
+Content for header 1
+## Header 2
+Content for header 2
+```python
+def func_definition():
+   print('Keep the whitespace consistent')
+```
+# Header 1 again
+We should also split on the horizontal line
+----
+This will be a new doc but with the same header metadata
+
+And it includes a new paragraph"""
+
+
+def test_experimental_markdown_syntax_text_splitter() -> None:
+    """Default settings: headers become metadata; code and rules split chunks."""
+    splitter = ExperimentalMarkdownSyntaxTextSplitter()
+
+    code_block = (
+        "```python\n"
+        "def func_definition():\n"
+        "   print('Keep the whitespace consistent')\n"
+        "```\n"
+    )
+    closing_paragraphs = (
+        "This will be a new doc but with the same header metadata\n\n"
+        "And it includes a new paragraph"
+    )
+    expected = [
+        Document(
+            page_content="Content for header 1\n",
+            metadata={"Header 1": "My Header 1"},
+        ),
+        Document(
+            page_content="Content for header 2\n",
+            metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
+        ),
+        Document(
+            page_content=code_block,
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1",
+                "Header 2": "Header 2",
+            },
+        ),
+        Document(
+            page_content="We should also split on the horizontal line\n",
+            metadata={"Header 1": "Header 1 again"},
+        ),
+        Document(
+            page_content=closing_paragraphs,
+            metadata={"Header 1": "Header 1 again"},
+        ),
+    ]
+
+    actual = splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)
+
+    assert actual == expected
+
+
+def test_experimental_markdown_syntax_text_splitter_header_configuration() -> None:
+    """Only configured headers split; other header lines stay in the content."""
+    splitter = ExperimentalMarkdownSyntaxTextSplitter(
+        headers_to_split_on=[("#", "Encabezamiento 1")]
+    )
+
+    code_block = (
+        "```python\n"
+        "def func_definition():\n"
+        "   print('Keep the whitespace consistent')\n"
+        "```\n"
+    )
+    closing_paragraphs = (
+        "This will be a new doc but with the same header metadata\n\n"
+        "And it includes a new paragraph"
+    )
+    expected = [
+        Document(
+            # "## Header 2" is not configured, so it is treated as plain text.
+            page_content="Content for header 1\n## Header 2\nContent for header 2\n",
+            metadata={"Encabezamiento 1": "My Header 1"},
+        ),
+        Document(
+            page_content=code_block,
+            metadata={"Code": "python", "Encabezamiento 1": "My Header 1"},
+        ),
+        Document(
+            page_content="We should also split on the horizontal line\n",
+            metadata={"Encabezamiento 1": "Header 1 again"},
+        ),
+        Document(
+            page_content=closing_paragraphs,
+            metadata={"Encabezamiento 1": "Header 1 again"},
+        ),
+    ]
+
+    actual = splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)
+
+    assert actual == expected
+
+
+def test_experimental_markdown_syntax_text_splitter_with_headers() -> None:
+    """With ``strip_headers=False`` header lines are kept in the chunk content."""
+    splitter = ExperimentalMarkdownSyntaxTextSplitter(strip_headers=False)
+
+    code_block = (
+        "```python\n"
+        "def func_definition():\n"
+        "   print('Keep the whitespace consistent')\n"
+        "```\n"
+    )
+    closing_paragraphs = (
+        "This will be a new doc but with the same header metadata\n\n"
+        "And it includes a new paragraph"
+    )
+    expected = [
+        Document(
+            page_content="# My Header 1\nContent for header 1\n",
+            metadata={"Header 1": "My Header 1"},
+        ),
+        Document(
+            page_content="## Header 2\nContent for header 2\n",
+            metadata={"Header 1": "My Header 1", "Header 2": "Header 2"},
+        ),
+        Document(
+            page_content=code_block,
+            metadata={
+                "Code": "python",
+                "Header 1": "My Header 1",
+                "Header 2": "Header 2",
+            },
+        ),
+        Document(
+            page_content=(
+                "# Header 1 again\nWe should also split on the horizontal line\n"
+            ),
+            metadata={"Header 1": "Header 1 again"},
+        ),
+        Document(
+            page_content=closing_paragraphs,
+            metadata={"Header 1": "Header 1 again"},
+        ),
+    ]
+
+    actual = splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)
+
+    assert actual == expected
+
+
+def test_experimental_markdown_syntax_text_splitter_split_lines() -> None:
+    """With ``return_each_line=True`` every non-blank line is its own document."""
+    splitter = ExperimentalMarkdownSyntaxTextSplitter(return_each_line=True)
+
+    header_1 = {"Header 1": "My Header 1"}
+    header_2 = {"Header 1": "My Header 1", "Header 2": "Header 2"}
+    code = {"Code": "python", "Header 1": "My Header 1", "Header 2": "Header 2"}
+    header_1_again = {"Header 1": "Header 1 again"}
+
+    # (line, metadata) pairs in document order; each line inherits the metadata
+    # of the chunk it came from.
+    expected_lines = [
+        ("Content for header 1", header_1),
+        ("Content for header 2", header_2),
+        ("```python", code),
+        ("def func_definition():", code),
+        ("   print('Keep the whitespace consistent')", code),
+        ("```", code),
+        ("We should also split on the horizontal line", header_1_again),
+        ("This will be a new doc but with the same header metadata", header_1_again),
+        ("And it includes a new paragraph", header_1_again),
+    ]
+    expected = [
+        Document(page_content=line, metadata=dict(metadata))
+        for line, metadata in expected_lines
+    ]
+
+    actual = splitter.split_text(EXPERIMENTAL_MARKDOWN_DOCUMENT)
+
+    assert actual == expected
+
+
 def test_solidity_code_splitter() -> None:
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.SOL, chunk_size=CHUNK_SIZE, chunk_overlap=0