langchain/libs/text-splitters/langchain_text_splitters/markdown.py

from __future__ import annotations

import re
from typing import Any, Dict, List, Tuple, TypedDict, Union

from langchain_core.documents import Document

from langchain_text_splitters.base import Language
from langchain_text_splitters.character import RecursiveCharacterTextSplitter


class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive character splitter preconfigured with Markdown separators."""

    def __init__(self, **kwargs: Any) -> None:
        """Build the splitter from the separator list for ``Language.MARKDOWN``.

        Args:
            **kwargs: Forwarded unchanged to the parent splitter's constructor.
        """
        super().__init__(
            separators=self.get_separators_for_language(Language.MARKDOWN),
            **kwargs,
        )


class MarkdownHeaderTextSplitter:
    """Split Markdown text into chunks tagged with their enclosing headers.

    Each output ``Document`` carries, as metadata, the text of every tracked
    header that is "open" at that point of the input (for example
    ``{"Header 1": "Foo", "Header 2": "Bar"}``).
    """

    def __init__(
        self,
        headers_to_split_on: List[Tuple[str, str]],
        return_each_line: bool = False,
        strip_headers: bool = True,
    ):
        """Create a new MarkdownHeaderTextSplitter.

        Args:
            headers_to_split_on: ``(marker, metadata_key)`` pairs, e.g.
                ``("##", "Header 2")``, naming the headers to track.
            return_each_line: If True, return every collected block on its
                own instead of merging blocks that share the same metadata.
            strip_headers: If True, header lines are dropped from the chunk
                content; if False, they are kept in the content.
        """
        # Output line-by-line or aggregated into chunks w/ common headers
        self.return_each_line = return_each_line
        # Longest markers first, so that "##" is tested before "#" when a line
        # is matched against the configured headers.
        self.headers_to_split_on = sorted(
            headers_to_split_on, key=lambda split: len(split[0]), reverse=True
        )
        # Whether header lines are removed from the content of the chunks
        self.strip_headers = strip_headers

    def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
        """Combine consecutive lines with common metadata into chunks.

        Note that the dictionaries in ``lines`` are extended in place while
        they are being merged.

        Args:
            lines: Blocks of text with their associated header metadata.

        Returns:
            One ``Document`` per aggregated chunk, in input order.
        """
        aggregated_chunks: List[LineType] = []

        for line in lines:
            previous = aggregated_chunks[-1] if aggregated_chunks else None

            if previous is not None and previous["metadata"] == line["metadata"]:
                # Same metadata as the previous chunk: extend that chunk.
                previous["content"] += "  \n" + line["content"]
            elif (
                previous is not None
                and not self.strip_headers
                # may be issues if other metadata is present
                and len(previous["metadata"]) < len(line["metadata"])
                # ``startswith`` (rather than indexing ``[0]``) keeps this safe
                # when the previous chunk ends with an empty line.
                and previous["content"].rsplit("\n", 1)[-1].startswith("#")
            ):
                # Headers are kept in the content, the previous chunk ends
                # with a header line, and the current line sits under more
                # headers than the previous chunk: glue the current line to
                # that header and adopt the current line's metadata.
                previous["content"] += "  \n" + line["content"]
                previous["metadata"] = line["metadata"]
            else:
                # Otherwise, start a new chunk
                aggregated_chunks.append(line)

        return [
            Document(page_content=chunk["content"], metadata=chunk["metadata"])
            for chunk in aggregated_chunks
        ]

    def split_text(self, text: str) -> List[Document]:
        """Split a Markdown document along the configured headers.

        Args:
            text: Markdown text to split.

        Returns:
            The resulting documents. When ``return_each_line`` is False,
            consecutive blocks with identical metadata are merged via
            ``aggregate_lines_to_chunks``; otherwise each block is returned
            as its own document.
        """
        # Split the input text by newline character ("\n").
        lines = text.split("\n")
        # Final output
        lines_with_metadata: List[LineType] = []
        # Content and metadata of the chunk currently being processed
        current_content: List[str] = []
        current_metadata: Dict[str, str] = {}
        # Keep track of the nested header structure
        header_stack: List[HeaderType] = []
        # Metadata implied by the headers currently on the stack
        initial_metadata: Dict[str, str] = {}

        in_code_block = False
        opening_fence = ""

        for line in lines:
            stripped_line = line.strip()
            # Remove all non-printable characters from the string, keeping only
            # visible text (e.g. a leading BOM would otherwise hide a header).
            stripped_line = "".join(filter(str.isprintable, stripped_line))
            if not in_code_block:
                # A line holding more than one ``` is treated as an inline
                # code span, not as the start of a fenced block.
                if stripped_line.startswith("```") and stripped_line.count("```") == 1:
                    in_code_block = True
                    opening_fence = "```"
                elif stripped_line.startswith("~~~"):
                    in_code_block = True
                    opening_fence = "~~~"
            else:
                # Only the fence type that opened the block can close it
                if stripped_line.startswith(opening_fence):
                    in_code_block = False
                    opening_fence = ""

            if in_code_block:
                # Inside a fenced block nothing is interpreted as a header and
                # blank lines do not end the current block of content.
                current_content.append(stripped_line)
                continue

            # Check each line against each of the header types (e.g., #, ##)
            for sep, name in self.headers_to_split_on:
                # Check if line starts with a header that we intend to split on
                if stripped_line.startswith(sep) and (
                    # Header with no text OR header is followed by space
                    # Both are valid conditions that sep is being used a header
                    len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "
                ):
                    # Ensure we are tracking the header as metadata
                    if name is not None:
                        # Get the current header level
                        current_header_level = sep.count("#")

                        # Pop out headers of lower or same level from the stack
                        while (
                            header_stack
                            and header_stack[-1]["level"] >= current_header_level
                        ):
                            # We have encountered a new header at the same or
                            # higher level: forget the popped header's value.
                            popped_header = header_stack.pop()
                            initial_metadata.pop(popped_header["name"], None)

                        # Push the current header to the stack
                        header: HeaderType = {
                            "level": current_header_level,
                            "name": name,
                            "data": stripped_line[len(sep) :].strip(),
                        }
                        header_stack.append(header)
                        # Update initial_metadata with the current header
                        initial_metadata[name] = header["data"]

                    # Flush the content collected so far; it still belongs to
                    # the metadata that was active before this header.
                    if current_content:
                        lines_with_metadata.append(
                            {
                                "content": "\n".join(current_content),
                                "metadata": current_metadata.copy(),
                            }
                        )
                        current_content.clear()

                    if not self.strip_headers:
                        current_content.append(stripped_line)

                    break
            else:
                # Not a tracked header: either regular text or a blank line
                if stripped_line:
                    current_content.append(stripped_line)
                elif current_content:
                    # A blank line ends the current block of content
                    lines_with_metadata.append(
                        {
                            "content": "\n".join(current_content),
                            "metadata": current_metadata.copy(),
                        }
                    )
                    current_content.clear()

            current_metadata = initial_metadata.copy()

        # Flush whatever is left after the last line
        if current_content:
            lines_with_metadata.append(
                {"content": "\n".join(current_content), "metadata": current_metadata}
            )

        # lines_with_metadata has each block with associated header metadata;
        # aggregate these into chunks based on common metadata
        if not self.return_each_line:
            return self.aggregate_lines_to_chunks(lines_with_metadata)
        return [
            Document(page_content=chunk["content"], metadata=chunk["metadata"])
            for chunk in lines_with_metadata
        ]


class LineType(TypedDict):
    """A block of text paired with the header metadata that applies to it."""

    # Header metadata, keyed by the configured header name (e.g. "Header 1")
    metadata: Dict[str, str]
    # The text of the block
    content: str


class HeaderType(TypedDict):
    """One entry of the header stack kept while walking a Markdown document."""

    # Nesting depth of the header (number of "#" characters in its marker)
    level: int
    # Metadata key under which the header text is reported
    name: str
    # Text of the header itself, without the marker
    data: str


class ExperimentalMarkdownSyntaxTextSplitter:
    """
    An experimental text splitter for handling Markdown syntax.

    This splitter aims to retain the exact whitespace of the original text while
    extracting structured metadata, such as headers. It is a re-implementation of the
    MarkdownHeaderTextSplitter with notable changes to the approach and
    additional features.

    Key Features:
    - Retains the original whitespace and formatting of the Markdown text.
    - Records the enclosing headers as metadata on every chunk.
    - Splits out fenced code blocks and stores the text following the opening
      fence (usually the language) under the "Code" metadata key.
    - Splits text on horizontal rules (`---`, `***`, `___`) as well; the rule
      line itself is dropped.
    - Defaults to sensible splitting behavior, which can be overridden using the
      `headers_to_split_on` parameter.

    Parameters:
    ----------
    headers_to_split_on : List[Tuple[str, str]], optional
        Headers to split on, defaulting to common Markdown headers if not specified.
    return_each_line : bool, optional
        When set to True, returns each line as a separate chunk. Default is False.
    strip_headers : bool, optional
        When set to True, header lines are removed from the chunk content.
        Default is True.

    Usage example:
    --------------
    >>> headers_to_split_on = [
    >>>     ("#", "Header 1"),
    >>>     ("##", "Header 2"),
    >>> ]
    >>> splitter = ExperimentalMarkdownSyntaxTextSplitter(
    >>>     headers_to_split_on=headers_to_split_on
    >>> )
    >>> chunks = splitter.split_text(text)
    >>> for chunk in chunks:
    >>>     print(chunk)

    This class is currently experimental and subject to change based on feedback and
    further development.
    """

    DEFAULT_HEADER_KEYS = {
        "#": "Header 1",
        "##": "Header 2",
        "###": "Header 3",
        "####": "Header 4",
        "#####": "Header 5",
        "######": "Header 6",
    }

    # Patterns are compiled once instead of on every line.
    _HEADER_PATTERN = re.compile(r"^(#{1,6}) (.*)")
    _CODE_PATTERN = re.compile(r"^(?:```|~~~)(.*)")
    # Three or more of the same rule character, then only whitespace. Unlike a
    # literal "\n" this also accepts CRLF endings and a rule on the last line
    # of a text that has no trailing newline.
    _HORZ_PATTERN = re.compile(r"^(?:\*{3,}|-{3,}|_{3,})\s*$")

    def __init__(
        self,
        headers_to_split_on: Union[List[Tuple[str, str]], None] = None,
        return_each_line: bool = False,
        strip_headers: bool = True,
    ):
        """Configure the splitter; see the class docstring for the parameters."""
        self.chunks: List[Document] = []
        self.current_chunk = Document(page_content="")
        self.current_header_stack: List[Tuple[int, str]] = []
        self.strip_headers = strip_headers
        if headers_to_split_on:
            self.splittable_headers = dict(headers_to_split_on)
        else:
            # Copy so that instances never share (and cannot mutate) the
            # class-level default mapping.
            self.splittable_headers = dict(self.DEFAULT_HEADER_KEYS)

        self.return_each_line = return_each_line

    def split_text(self, text: str) -> List[Document]:
        """Split ``text`` into documents along headers, code fences and rules.

        Args:
            text: Markdown text to split.

        Returns:
            The chunks found in ``text``. With ``return_each_line`` every
            non-blank line of every chunk becomes its own document carrying
            the metadata of the chunk it came from.
        """
        # Start from a clean slate: without this, a second call would return
        # the chunks of the previous call again and inherit its header stack.
        # A new list is bound (rather than clearing the old one) so that a
        # result returned earlier is not modified behind the caller's back.
        self.chunks = []
        self.current_chunk = Document(page_content="")
        self.current_header_stack = []

        raw_lines = text.splitlines(keepends=True)

        while raw_lines:
            raw_line = raw_lines.pop(0)
            header_match = self._match_header(raw_line)
            code_match = self._match_code(raw_line)
            horz_match = self._match_horz(raw_line)
            if header_match:
                # Content collected so far belongs to the previous headers
                self._complete_chunk_doc()

                if not self.strip_headers:
                    self.current_chunk.page_content += raw_line

                # add the header to the stack; "." never matches "\n" but it
                # does match the "\r" of a CRLF line ending, so drop that.
                header_depth = len(header_match.group(1))
                header_text = header_match.group(2).rstrip("\r")
                self._resolve_header_stack(header_depth, header_text)
            elif code_match:
                self._complete_chunk_doc()
                # The code block (fences included) becomes a chunk of its own
                self.current_chunk.page_content = self._resolve_code_chunk(
                    raw_line, raw_lines
                )
                self.current_chunk.metadata["Code"] = code_match.group(1).rstrip("\r")
                self._complete_chunk_doc()
            elif horz_match:
                # The rule only separates chunks; it is not kept as content
                self._complete_chunk_doc()
            else:
                self.current_chunk.page_content += raw_line

        self._complete_chunk_doc()
        # NOTE: `return_each_line` is arguably not a necessary feature of this
        # splitter: it is easy to do outside of the class, where the caller
        # has more control over it.
        if self.return_each_line:
            return [
                Document(page_content=line, metadata=chunk.metadata)
                for chunk in self.chunks
                for line in chunk.page_content.splitlines()
                if line and not line.isspace()
            ]
        return self.chunks

    def _resolve_header_stack(self, header_depth: int, header_text: str) -> None:
        """Make ``header_text`` the innermost open header at ``header_depth``.

        Every tracked header at the same or a deeper level is closed first, so
        a new "#" section does not keep reporting a "##" header that belonged
        to the previous section.
        """
        for i, (depth, _) in enumerate(self.current_header_stack):
            if depth >= header_depth:
                # Truncate everything from this level onward
                del self.current_header_stack[i:]
                break
        self.current_header_stack.append((header_depth, header_text))

    def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:
        """Consume the lines of a fenced code block from ``raw_lines``.

        Args:
            current_line: The opening fence line.
            raw_lines: Remaining lines; the consumed ones are removed in place.

        Returns:
            The code block including both fence lines. If the fence is never
            closed, the block runs to the end of the input rather than the
            remaining text being discarded.
        """
        fence = current_line[:3]
        fence_is_known = fence in ("```", "~~~")
        parts = [current_line]
        while raw_lines:
            raw_line = raw_lines.pop(0)
            parts.append(raw_line)
            if fence_is_known:
                # Only the fence type that opened the block may close it, so a
                # "~~~" block can contain "```" lines and vice versa.
                closes = raw_line.startswith(fence)
            else:
                closes = self._match_code(raw_line) is not None
            if closes:
                break
        return "".join(parts)

    def _complete_chunk_doc(self) -> None:
        """Store the current chunk (unless it is blank) and start a new one."""
        chunk_content = self.current_chunk.page_content
        # Discard any empty documents
        if chunk_content and not chunk_content.isspace():
            # Apply the header stack as metadata
            for depth, value in self.current_header_stack:
                header_key = self.splittable_headers.get("#" * depth)
                self.current_chunk.metadata[header_key] = value
            self.chunks.append(self.current_chunk)
        # Reset the current chunk
        self.current_chunk = Document(page_content="")

    # Match methods
    def _match_header(self, line: str) -> Union[re.Match, None]:
        """Return a match if ``line`` is one of the configured headers.

        Group 1 of the match is the "#" marker, group 2 the header text.
        """
        match = self._HEADER_PATTERN.match(line)
        # Only matches on the configured headers
        if match and match.group(1) in self.splittable_headers:
            return match
        return None

    def _match_code(self, line: str) -> Union[re.Match, None]:
        """Return a match if ``line`` starts with a ``` or ~~~ code fence.

        Group 1 of the match is whatever follows the fence on that line.
        """
        return self._CODE_PATTERN.match(line)

    def _match_horz(self, line: str) -> Union[re.Match, None]:
        """Return a match if ``line`` is a ``***``, ``---`` or ``___`` rule."""
        return self._HORZ_PATTERN.match(line)
text-splitters[minor], langchain[minor], community[patch], templates, docs: langchain-text-splitters 0.0.1 (#18346) 2024-03-01 02:33:21 +00:00			`from __future__ import annotations`

text-splitters: Introduce Experimental Markdown Syntax Splitter (#22257) #### Description This MR defines a `ExperimentalMarkdownSyntaxTextSplitter` class. The main goal is to replicate the functionality of the original `MarkdownHeaderTextSplitter` which extracts the header stack as metadata but with one critical difference: it keeps the whitespace of the original text intact. This draft reimplements the `MarkdownHeaderTextSplitter` with a very different algorithmic approach. Instead of marking up each line of the text individually and aggregating them back together into chunks, this method builds each chunk sequentially and applies the metadata to each chunk. This makes the implementation simpler. However, since it's designed to keep white space intact its not a full drop in replacement for the original. Since it is a radical implementation change to the original code and I would like to get feedback to see if this is a worthwhile replacement, should be it's own class, or is not a good idea at all. Note: I implemented the `return_each_line` parameter but I don't think it's a necessary feature. I'd prefer to remove it. This implementation also adds the following additional features: - Splits out code blocks and includes the language in the `"Code"` metadata key - Splits text on the horizontal rule `---` as well - The `headers_to_split_on` parameter is now optional - with sensible defaults that can be overridden. #### Issue Keeping the whitespace keeps the paragraphs structure and the formatting of the code blocks intact which allows the caller much more flexibility in how they want to further split the individuals sections of the resulting documents. 
This addresses the issues brought up by the community in the following issues: - https://github.com/langchain-ai/langchain/issues/20823 - https://github.com/langchain-ai/langchain/issues/19436 - https://github.com/langchain-ai/langchain/issues/22256 #### Dependencies N/A #### Twitter handle @RyanElston --------- Co-authored-by: isaac hershenson <ihershenson@hmc.edu> 2024-06-19 02:44:00 +00:00			`import re`
			`from typing import Any, Dict, List, Tuple, TypedDict, Union`
text-splitters[minor], langchain[minor], community[patch], templates, docs: langchain-text-splitters 0.0.1 (#18346) 2024-03-01 02:33:21 +00:00
			`from langchain_core.documents import Document`

			`from langchain_text_splitters.base import Language`
			`from langchain_text_splitters.character import RecursiveCharacterTextSplitter`


			`class MarkdownTextSplitter(RecursiveCharacterTextSplitter):`
			`"""Attempts to split the text along Markdown-formatted headings."""`

			`def __init__(self, **kwargs: Any) -> None:`
			`"""Initialize a MarkdownTextSplitter."""`
			`separators = self.get_separators_for_language(Language.MARKDOWN)`
			`super().__init__(separators=separators, **kwargs)`


			`class MarkdownHeaderTextSplitter:`
			`"""Splitting markdown files based on specified headers."""`

			`def __init__(`
			`self,`
			`headers_to_split_on: List[Tuple[str, str]],`
			`return_each_line: bool = False,`
			`strip_headers: bool = True,`
			`):`
			`"""Create a new MarkdownHeaderTextSplitter.`

			`Args:`
			`headers_to_split_on: Headers we want to track`
			`return_each_line: Return each line w/ associated headers`
			`strip_headers: Strip split headers from the content of the chunk`
			`"""`
			`# Output line-by-line or aggregated into chunks w/ common headers`
			`self.return_each_line = return_each_line`
			`# Given the headers we want to split on,`
			`# (e.g., "#, ##, etc") order by length`
			`self.headers_to_split_on = sorted(`
			`headers_to_split_on, key=lambda split: len(split[0]), reverse=True`
			`)`
			`# Strip headers split headers from the content of the chunk`
			`self.strip_headers = strip_headers`

			`def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:`
			`"""Combine lines with common metadata into chunks`
			`Args:`
			`lines: Line of text / associated header metadata`
			`"""`
			`aggregated_chunks: List[LineType] = []`

			`for line in lines:`
			`if (`
			`aggregated_chunks`
			`and aggregated_chunks[-1]["metadata"] == line["metadata"]`
			`):`
			`# If the last line in the aggregated list`
			`# has the same metadata as the current line,`
			`# append the current content to the last lines's content`
			`aggregated_chunks[-1]["content"] += " \n" + line["content"]`
			`elif (`
			`aggregated_chunks`
			`and aggregated_chunks[-1]["metadata"] != line["metadata"]`
			`# may be issues if other metadata is present`
			`and len(aggregated_chunks[-1]["metadata"]) < len(line["metadata"])`
			`and aggregated_chunks[-1]["content"].split("\n")[-1][0] == "#"`
			`and not self.strip_headers`
			`):`
			`# If the last line in the aggregated list`
			`# has different metadata as the current line,`
			`# and has shallower header level than the current line,`
			`# and the last line is a header,`
			`# and we are not stripping headers,`
			`# append the current content to the last line's content`
			`aggregated_chunks[-1]["content"] += " \n" + line["content"]`
			`# and update the last line's metadata`
			`aggregated_chunks[-1]["metadata"] = line["metadata"]`
			`else:`
			`# Otherwise, append the current line to the aggregated list`
			`aggregated_chunks.append(line)`

			`return [`
			`Document(page_content=chunk["content"], metadata=chunk["metadata"])`
			`for chunk in aggregated_chunks`
			`]`

			`def split_text(self, text: str) -> List[Document]:`
			`"""Split markdown file`
			`Args:`
			`text: Markdown file"""`

			`# Split the input text by newline character ("\n").`
			`lines = text.split("\n")`
			`# Final output`
			`lines_with_metadata: List[LineType] = []`
			`# Content and metadata of the chunk currently being processed`
			`current_content: List[str] = []`
			`current_metadata: Dict[str, str] = {}`
			`# Keep track of the nested header structure`
			`# header_stack: List[Dict[str, Union[int, str]]] = []`
			`header_stack: List[HeaderType] = []`
			`initial_metadata: Dict[str, str] = {}`

			`in_code_block = False`
			`opening_fence = ""`

			`for line in lines:`
			`stripped_line = line.strip()`
text-splitters[patch]: fix MarkdownHeaderTextSplitter fails to parse headers with non-printable characters (#20645) Description: MarkdownHeaderTextSplitter Fails to Parse Headers with non-printable characters. more #20643 The following is the official test case. Just replacing `# Foo\n\n` with `\ufeff# Foo\n\n` will cause the test case to fail. chunk metadata is empty ```python def test_md_header_text_splitter_1() -> None: """Test markdown splitter by header: Case 1.""" markdown_document = ( "\ufeff# Foo\n\n" " ## Bar\n\n" "Hi this is Jim\n\n" "Hi this is Joe\n\n" " ## Baz\n\n" " Hi this is Molly" ) headers_to_split_on = [ ("#", "Header 1"), ("##", "Header 2"), ] markdown_splitter = MarkdownHeaderTextSplitter( headers_to_split_on=headers_to_split_on, ) output = markdown_splitter.split_text(markdown_document) expected_output = [ Document( page_content="Hi this is Jim \nHi this is Joe", metadata={"Header 1": "Foo", "Header 2": "Bar"}, ), Document( page_content="Hi this is Molly", metadata={"Header 1": "Foo", "Header 2": "Baz"}, ), ] assert output == expected_output ``` twitter: @coolbeevip Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-04-25 00:07:42 +00:00			`# Remove all non-printable characters from the string, keeping only visible`
			`# text.`
			`stripped_line = "".join(filter(str.isprintable, stripped_line))`
text-splitters[minor], langchain[minor], community[patch], templates, docs: langchain-text-splitters 0.0.1 (#18346) 2024-03-01 02:33:21 +00:00			`if not in_code_block:`
			`# Exclude inline code spans`
			if stripped_line.startswith("```") and stripped_line.count("```") == 1:
			`in_code_block = True`
			opening_fence = "```"
			`elif stripped_line.startswith("~~~"):`
			`in_code_block = True`
			`opening_fence = "~~~"`
			`else:`
			`if stripped_line.startswith(opening_fence):`
			`in_code_block = False`
			`opening_fence = ""`

			`if in_code_block:`
			`current_content.append(stripped_line)`
			`continue`

			`# Check each line against each of the header types (e.g., #, ##)`
			`for sep, name in self.headers_to_split_on:`
			`# Check if line starts with a header that we intend to split on`
			`if stripped_line.startswith(sep) and (`
			`# Header with no text OR header is followed by space`
			`# Both are valid conditions that sep is being used a header`
			`len(stripped_line) == len(sep) or stripped_line[len(sep)] == " "`
			`):`
			`# Ensure we are tracking the header as metadata`
			`if name is not None:`
			`# Get the current header level`
			`current_header_level = sep.count("#")`

			`# Pop out headers of lower or same level from the stack`
			`while (`
			`header_stack`
			`and header_stack[-1]["level"] >= current_header_level`
			`):`
			`# We have encountered a new header`
			`# at the same or higher level`
			`popped_header = header_stack.pop()`
			`# Clear the metadata for the`
			`# popped header in initial_metadata`
			`if popped_header["name"] in initial_metadata:`
			`initial_metadata.pop(popped_header["name"])`

			`# Push the current header to the stack`
			`header: HeaderType = {`
			`"level": current_header_level,`
			`"name": name,`
			`"data": stripped_line[len(sep) :].strip(),`
			`}`
			`header_stack.append(header)`
			`# Update initial_metadata with the current header`
			`initial_metadata[name] = header["data"]`

			`# Add the previous line to the lines_with_metadata`
			`# only if current_content is not empty`
			`if current_content:`
			`lines_with_metadata.append(`
			`{`
			`"content": "\n".join(current_content),`
			`"metadata": current_metadata.copy(),`
			`}`
			`)`
			`current_content.clear()`

			`if not self.strip_headers:`
			`current_content.append(stripped_line)`

			`break`
			`else:`
			`if stripped_line:`
			`current_content.append(stripped_line)`
			`elif current_content:`
			`lines_with_metadata.append(`
			`{`
			`"content": "\n".join(current_content),`
			`"metadata": current_metadata.copy(),`
			`}`
			`)`
			`current_content.clear()`

			`current_metadata = initial_metadata.copy()`

			`if current_content:`
			`lines_with_metadata.append(`
			`{"content": "\n".join(current_content), "metadata": current_metadata}`
			`)`

			`# lines_with_metadata has each line with associated header metadata`
			`# aggregate these into chunks based on common metadata`
			`if not self.return_each_line:`
			`return self.aggregate_lines_to_chunks(lines_with_metadata)`
			`else:`
			`return [`
			`Document(page_content=chunk["content"], metadata=chunk["metadata"])`
			`for chunk in lines_with_metadata`
			`]`


			`class LineType(TypedDict):`
			`"""Line type as typed dict."""`

			`metadata: Dict[str, str]`
			`content: str`


			`class HeaderType(TypedDict):`
			`"""Header type as typed dict."""`

			`level: int`
			`name: str`
			`data: str`
text-splitters: Introduce Experimental Markdown Syntax Splitter (#22257) #### Description This MR defines a `ExperimentalMarkdownSyntaxTextSplitter` class. The main goal is to replicate the functionality of the original `MarkdownHeaderTextSplitter` which extracts the header stack as metadata but with one critical difference: it keeps the whitespace of the original text intact. This draft reimplements the `MarkdownHeaderTextSplitter` with a very different algorithmic approach. Instead of marking up each line of the text individually and aggregating them back together into chunks, this method builds each chunk sequentially and applies the metadata to each chunk. This makes the implementation simpler. However, since it's designed to keep white space intact its not a full drop in replacement for the original. Since it is a radical implementation change to the original code and I would like to get feedback to see if this is a worthwhile replacement, should be it's own class, or is not a good idea at all. Note: I implemented the `return_each_line` parameter but I don't think it's a necessary feature. I'd prefer to remove it. This implementation also adds the following additional features: - Splits out code blocks and includes the language in the `"Code"` metadata key - Splits text on the horizontal rule `---` as well - The `headers_to_split_on` parameter is now optional - with sensible defaults that can be overridden. #### Issue Keeping the whitespace keeps the paragraphs structure and the formatting of the code blocks intact which allows the caller much more flexibility in how they want to further split the individuals sections of the resulting documents. 
This addresses the issues brought up by the community in the following issues: - https://github.com/langchain-ai/langchain/issues/20823 - https://github.com/langchain-ai/langchain/issues/19436 - https://github.com/langchain-ai/langchain/issues/22256 #### Dependencies N/A #### Twitter handle @RyanElston --------- Co-authored-by: isaac hershenson <ihershenson@hmc.edu> 2024-06-19 02:44:00 +00:00

			`class ExperimentalMarkdownSyntaxTextSplitter:`
			`"""`
			`An experimental text splitter for handling Markdown syntax.`

			`This splitter aims to retain the exact whitespace of the original text while`
			`extracting structured metadata, such as headers. It is a re-implementation of the`
			`MarkdownHeaderTextSplitter with notable changes to the approach and`
			`additional features.`

			`Key Features:`
			`- Retains the original whitespace and formatting of the Markdown text.`
			`- Extracts headers, code blocks, and horizontal rules as metadata.`
			`- Splits out code blocks and includes the language in the "Code" metadata key.`
			- Splits text on horizontal rules (`---`) as well.
			`- Defaults to sensible splitting behavior, which can be overridden using the`
			`headers_to_split_on` parameter.

			`Parameters:`
			`----------`
			`headers_to_split_on : List[Tuple[str, str]], optional`
			`Headers to split on, defaulting to common Markdown headers if not specified.`
			`return_each_line : bool, optional`
			`When set to True, returns each line as a separate chunk. Default is False.`

			`Usage example:`
			`--------------`
			`>>> headers_to_split_on = [`
			`>>> ("#", "Header 1"),`
			`>>> ("##", "Header 2"),`
			`>>> ]`
			`>>> splitter = ExperimentalMarkdownSyntaxTextSplitter(`
			`>>> headers_to_split_on=headers_to_split_on`
			`>>> )`
			`>>> chunks = splitter.split(text)`
			`>>> for chunk in chunks:`
			`>>> print(chunk)`

			`This class is currently experimental and subject to change based on feedback and`
			`further development.`
			`"""`

			`DEFAULT_HEADER_KEYS = {`
			`"#": "Header 1",`
			`"##": "Header 2",`
			`"###": "Header 3",`
			`"####": "Header 4",`
			`"#####": "Header 5",`
			`"######": "Header 6",`
			`}`

			`def __init__(`
			`self,`
			`headers_to_split_on: Union[List[Tuple[str, str]], None] = None,`
			`return_each_line: bool = False,`
			`strip_headers: bool = True,`
			`):`
			`self.chunks: List[Document] = []`
			`self.current_chunk = Document(page_content="")`
			`self.current_header_stack: List[Tuple[int, str]] = []`
			`self.strip_headers = strip_headers`
			`if headers_to_split_on:`
			`self.splittable_headers = dict(headers_to_split_on)`
			`else:`
			`self.splittable_headers = self.DEFAULT_HEADER_KEYS`

			`self.return_each_line = return_each_line`

			`def split_text(self, text: str) -> List[Document]:`
			`raw_lines = text.splitlines(keepends=True)`

			`while raw_lines:`
			`raw_line = raw_lines.pop(0)`
			`header_match = self._match_header(raw_line)`
			`code_match = self._match_code(raw_line)`
			`horz_match = self._match_horz(raw_line)`
			`if header_match:`
			`self._complete_chunk_doc()`

			`if not self.strip_headers:`
			`self.current_chunk.page_content += raw_line`

			`# add the header to the stack`
			`header_depth = len(header_match.group(1))`
			`header_text = header_match.group(2)`
			`self._resolve_header_stack(header_depth, header_text)`
			`elif code_match:`
			`self._complete_chunk_doc()`
			`self.current_chunk.page_content = self._resolve_code_chunk(`
			`raw_line, raw_lines`
			`)`
			`self.current_chunk.metadata["Code"] = code_match.group(1)`
			`self._complete_chunk_doc()`
			`elif horz_match:`
			`self._complete_chunk_doc()`
			`else:`
			`self.current_chunk.page_content += raw_line`

			`self._complete_chunk_doc()`
			# I don't see why `return_each_line` is a necessary feature of this splitter.
			`# It's easy enough to to do outside of the class and the caller can have more`
			`# control over it.`
			`if self.return_each_line:`
			`return [`
			`Document(page_content=line, metadata=chunk.metadata)`
			`for chunk in self.chunks`
			`for line in chunk.page_content.splitlines()`
			`if line and not line.isspace()`
			`]`
			`return self.chunks`

			`def _resolve_header_stack(self, header_depth: int, header_text: str) -> None:`
			`for i, (depth, _) in enumerate(self.current_header_stack):`
			`if depth == header_depth:`
			`self.current_header_stack[i] = (header_depth, header_text)`
			`self.current_header_stack = self.current_header_stack[: i + 1]`
			`return`
			`self.current_header_stack.append((header_depth, header_text))`

			`def _resolve_code_chunk(self, current_line: str, raw_lines: List[str]) -> str:`
			`chunk = current_line`
			`while raw_lines:`
			`raw_line = raw_lines.pop(0)`
			`chunk += raw_line`
			`if self._match_code(raw_line):`
			`return chunk`
			`return ""`

			`def _complete_chunk_doc(self) -> None:`
			`chunk_content = self.current_chunk.page_content`
			`# Discard any empty documents`
			`if chunk_content and not chunk_content.isspace():`
			`# Apply the header stack as metadata`
			`for depth, value in self.current_header_stack:`
			`header_key = self.splittable_headers.get("#" * depth)`
			`self.current_chunk.metadata[header_key] = value`
			`self.chunks.append(self.current_chunk)`
			`# Reset the current chunk`
			`self.current_chunk = Document(page_content="")`

			`# Match methods`
			`def _match_header(self, line: str) -> Union[re.Match, None]:`
			`match = re.match(r"^(#{1,6}) (.*)", line)`
			`# Only matches on the configured headers`
			`if match and match.group(1) in self.splittable_headers:`
			`return match`
			`return None`

			`def _match_code(self, line: str) -> Union[re.Match, None]:`
			matches = [re.match(rule, line) for rule in [r"^```(.)", r"^~~~(.)"]]
			`return next((match for match in matches if match), None)`

			`def _match_horz(self, line: str) -> Union[re.Match, None]:`
			`matches = [`
			`re.match(rule, line) for rule in [r"^\\\*+\n", r"^---+\n", r"^___+\n"]`
			`]`
			`return next((match for match in matches if match), None)`