Add option to preserve headers in MarkdownHeaderTextSplitter (#14433)

- **Description:** `MarkdownHeaderTextSplitter` currently strips header lines from chunked content. Many applications require that these header lines be preserved. This adds an optional parameter to preserve those headers in the chunked content. - **Issue:** #2836 (relevant) - **Dependencies:** - - **Tag maintainer:** @baskaryan - **Twitter handle:** @finnless Unit tests and new examples in notebook included. cc @rlancemartin --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
6 months ago · 6c4b5a4eff
parent 0a7d360ba4
commit 6c4b5a4eff
3 changed files with 151 additions and 7 deletions
--- a/docs/docs/modules/data_connection/document_transformers/markdown_header_metadata.ipynb
+++ b/docs/docs/modules/data_connection/document_transformers/markdown_header_metadata.ipynb
@ -117,6 +117,41 @@
    "type(md_header_splits[0])"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "102aad57-7bef-42d3-ab4e-b50d6dc11718",
+   "metadata": {},
+   "source": [
+    "By default, `MarkdownHeaderTextSplitter` strips headers being split on from the output chunk's content. This can be disabled by setting `strip_headers = False`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "9fce45ba-a4be-4a69-ad27-f5ff195c4fd7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='# Foo  \\n## Bar  \\nHi this is Jim  \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n",
+       " Document(page_content='### Boo  \\nHi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n",
+       " Document(page_content='## Baz  \\nHi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "markdown_splitter = MarkdownHeaderTextSplitter(\n",
+    "    headers_to_split_on=headers_to_split_on, strip_headers=False\n",
+    ")\n",
+    "md_header_splits = markdown_splitter.split_text(markdown_document)\n",
+    "md_header_splits"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "9bd8977a",
@ -127,7 +162,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "id": "480e0e3a",
   "metadata": {
    "ExecuteTime": {
@ -139,14 +174,14 @@
    {
     "data": {
      "text/plain": [
-       "[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
+       "[Document(page_content='# Intro  \\n## History  \\nMarkdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
       " Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
-       " Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for  \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.  \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
+       " Document(page_content='## Rise and divergence  \\nAs Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for  \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
       " Document(page_content='#### Standardization  \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
-       " Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
+       " Document(page_content='## Implementations  \\nImplementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
      ]
     },
-     "execution_count": 4,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -160,7 +195,9 @@
    "]\n",
    "\n",
    "# MD splits\n",
-    "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
+    "markdown_splitter = MarkdownHeaderTextSplitter(\n",
+    "    headers_to_split_on=headers_to_split_on, strip_headers=False\n",
+    ")\n",
    "md_header_splits = markdown_splitter.split_text(markdown_document)\n",
    "\n",
    "# Char-level splits\n",
--- a/libs/langchain/langchain/text_splitter.py
+++ b/libs/langchain/langchain/text_splitter.py
@ -323,13 +323,17 @@ class MarkdownHeaderTextSplitter:
    """Splitting markdown files based on specified headers."""

    def __init__(
-        self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False
+        self,
+        headers_to_split_on: List[Tuple[str, str]],
+        return_each_line: bool = False,
+        strip_headers: bool = True,
    ):
        """Create a new MarkdownHeaderTextSplitter.

        Args:
            headers_to_split_on: Headers we want to track
            return_each_line: Return each line w/ associated headers
+            strip_headers: Strip split headers from the content of the chunk
        """
        # Output line-by-line or aggregated into chunks w/ common headers
        self.return_each_line = return_each_line
@ -338,6 +342,8 @@ class MarkdownHeaderTextSplitter:
        self.headers_to_split_on = sorted(
            headers_to_split_on, key=lambda split: len(split[0]), reverse=True
        )
+        # Whether to strip split headers from the content of the chunk
+        self.strip_headers = strip_headers

    def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
        """Combine lines with common metadata into chunks
@ -355,6 +361,23 @@ class MarkdownHeaderTextSplitter:
                # has the same metadata as the current line,
                # append the current content to the last lines's content
                aggregated_chunks[-1]["content"] += "  \n" + line["content"]
+            elif (
+                aggregated_chunks
+                and aggregated_chunks[-1]["metadata"] != line["metadata"]
+                # may be issues if other metadata is present
+                and len(aggregated_chunks[-1]["metadata"]) < len(line["metadata"])
+                and aggregated_chunks[-1]["content"].split("\n")[-1][0] == "#"
+                and not self.strip_headers
+            ):
+                # If the last line in the aggregated list
+                # has different metadata than the current line,
+                # and has shallower header level than the current line,
+                # and the last line is a header,
+                # and we are not stripping headers,
+                # append the current content to the last line's content
+                aggregated_chunks[-1]["content"] += "  \n" + line["content"]
+                # and update the last line's metadata
+                aggregated_chunks[-1]["metadata"] = line["metadata"]
            else:
                # Otherwise, append the current line to the aggregated list
                aggregated_chunks.append(line)
@ -451,6 +474,9 @@ class MarkdownHeaderTextSplitter:
                        )
                        current_content.clear()

+                    if not self.strip_headers:
+                        current_content.append(stripped_line)
+
                    break
            else:
                if stripped_line:
--- a/libs/langchain/tests/unit_tests/test_text_splitter.py
+++ b/libs/langchain/tests/unit_tests/test_text_splitter.py
@ -1035,6 +1035,87 @@ def test_md_header_text_splitter_3() -> None:
    assert output == expected_output


+def test_md_header_text_splitter_preserve_headers_1() -> None:
+    """Test markdown splitter by header: Preserve Headers."""
+
+    markdown_document = (
+        "# Foo\n\n"
+        "    ## Bat\n\n"
+        "Hi this is Jim\n\n"
+        "Hi Joe\n\n"
+        "## Baz\n\n"
+        "# Bar\n\n"
+        "This is Alice\n\n"
+        "This is Bob"
+    )
+    headers_to_split_on = [
+        ("#", "Header 1"),
+    ]
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+        strip_headers=False,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+    expected_output = [
+        Document(
+            page_content="# Foo  \n## Bat  \nHi this is Jim  \nHi Joe  \n## Baz",
+            metadata={"Header 1": "Foo"},
+        ),
+        Document(
+            page_content="# Bar  \nThis is Alice  \nThis is Bob",
+            metadata={"Header 1": "Bar"},
+        ),
+    ]
+    assert output == expected_output
+
+
+def test_md_header_text_splitter_preserve_headers_2() -> None:
+    """Test markdown splitter by header: Preserve Headers."""
+
+    markdown_document = (
+        "# Foo\n\n"
+        "    ## Bar\n\n"
+        "Hi this is Jim\n\n"
+        "Hi this is Joe\n\n"
+        "### Boo \n\n"
+        "Hi this is Lance\n\n"
+        "## Baz\n\n"
+        "Hi this is Molly\n"
+        "    ## Buz\n"
+        "# Bop"
+    )
+    headers_to_split_on = [
+        ("#", "Header 1"),
+        ("##", "Header 2"),
+        ("###", "Header 3"),
+    ]
+    markdown_splitter = MarkdownHeaderTextSplitter(
+        headers_to_split_on=headers_to_split_on,
+        strip_headers=False,
+    )
+    output = markdown_splitter.split_text(markdown_document)
+    expected_output = [
+        Document(
+            page_content="# Foo  \n## Bar  \nHi this is Jim  \nHi this is Joe",
+            metadata={"Header 1": "Foo", "Header 2": "Bar"},
+        ),
+        Document(
+            page_content="### Boo  \nHi this is Lance",
+            metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
+        ),
+        Document(
+            page_content="## Baz  \nHi this is Molly",
+            metadata={"Header 1": "Foo", "Header 2": "Baz"},
+        ),
+        Document(
+            page_content="## Buz",
+            metadata={"Header 1": "Foo", "Header 2": "Buz"},
+        ),
+        Document(page_content="# Bop", metadata={"Header 1": "Bop"}),
+    ]
+    assert output == expected_output
+
+
@pytest.mark.parametrize("fence", [("```"), ("~~~")])
 def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:
    """Test markdown splitter by header: Fenced code block."""