Add option to preserve headers in MarkdownHeaderTextSplitter (#14433)

- **Description:** `MarkdownHeaderTextSplitter` currently strips header
lines from chunked content. Many applications require that these header
lines be preserved. This adds an optional parameter to preserve those
headers in the chunked content.
  - **Issue:** #2836 (relevant)
  - **Dependencies:** -
  - **Tag maintainer:** @baskaryan
  - **Twitter handle:** @finnless

Unit tests and new examples in notebook included.

cc @rlancemartin

---------

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
pull/14990/head
Nolan 6 months ago committed by GitHub
parent 0a7d360ba4
commit 6c4b5a4eff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -117,6 +117,41 @@
"type(md_header_splits[0])"
]
},
{
"cell_type": "markdown",
"id": "102aad57-7bef-42d3-ab4e-b50d6dc11718",
"metadata": {},
"source": [
"By default, `MarkdownHeaderTextSplitter` strips headers being split on from the output chunk's content. This can be disabled by setting `strip_headers = False`."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9fce45ba-a4be-4a69-ad27-f5ff195c4fd7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='# Foo \\n## Bar \\nHi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n",
" Document(page_content='### Boo \\nHi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n",
" Document(page_content='## Baz \\nHi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"markdown_splitter = MarkdownHeaderTextSplitter(\n",
" headers_to_split_on=headers_to_split_on, strip_headers=False\n",
")\n",
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
"md_header_splits"
]
},
{
"cell_type": "markdown",
"id": "9bd8977a",
@ -127,7 +162,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "480e0e3a",
"metadata": {
"ExecuteTime": {
@ -139,14 +174,14 @@
{
"data": {
"text/plain": [
"[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
"[Document(page_content='# Intro \\n## History \\nMarkdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
" Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
" Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
" Document(page_content='## Rise and divergence \\nAs Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
" Document(page_content='#### Standardization \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
" Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
" Document(page_content='## Implementations \\nImplementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
]
},
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@ -160,7 +195,9 @@
"]\n",
"\n",
"# MD splits\n",
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
"markdown_splitter = MarkdownHeaderTextSplitter(\n",
" headers_to_split_on=headers_to_split_on, strip_headers=False\n",
")\n",
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
"\n",
"# Char-level splits\n",

@ -323,13 +323,17 @@ class MarkdownHeaderTextSplitter:
"""Splitting markdown files based on specified headers."""
def __init__(
self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False
self,
headers_to_split_on: List[Tuple[str, str]],
return_each_line: bool = False,
strip_headers: bool = True,
):
"""Create a new MarkdownHeaderTextSplitter.
Args:
headers_to_split_on: Headers we want to track
return_each_line: Return each line w/ associated headers
strip_headers: Strip split headers from the content of the chunk
"""
# Output line-by-line or aggregated into chunks w/ common headers
self.return_each_line = return_each_line
@ -338,6 +342,8 @@ class MarkdownHeaderTextSplitter:
self.headers_to_split_on = sorted(
headers_to_split_on, key=lambda split: len(split[0]), reverse=True
)
# Whether to strip the split-on headers from the content of the chunk
self.strip_headers = strip_headers
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
"""Combine lines with common metadata into chunks
@ -355,6 +361,23 @@ class MarkdownHeaderTextSplitter:
# has the same metadata as the current line,
# append the current content to the last line's content
aggregated_chunks[-1]["content"] += " \n" + line["content"]
elif (
aggregated_chunks
and aggregated_chunks[-1]["metadata"] != line["metadata"]
# may be issues if other metadata is present
and len(aggregated_chunks[-1]["metadata"]) < len(line["metadata"])
and aggregated_chunks[-1]["content"].split("\n")[-1][0] == "#"
and not self.strip_headers
):
# If the last line in the aggregated list
# has different metadata than the current line,
# and has a shallower header level than the current line,
# and the last line is a header,
# and we are not stripping headers,
# append the current content to the last line's content
aggregated_chunks[-1]["content"] += " \n" + line["content"]
# and update the last line's metadata
aggregated_chunks[-1]["metadata"] = line["metadata"]
else:
# Otherwise, append the current line to the aggregated list
aggregated_chunks.append(line)
@ -451,6 +474,9 @@ class MarkdownHeaderTextSplitter:
)
current_content.clear()
if not self.strip_headers:
current_content.append(stripped_line)
break
else:
if stripped_line:

@ -1035,6 +1035,87 @@ def test_md_header_text_splitter_3() -> None:
assert output == expected_output
def test_md_header_text_splitter_preserve_headers_1() -> None:
    """Check that headers stay in chunk content when ``strip_headers=False``.

    Only ``#`` is tracked here, so the ``##`` lines are expected to appear as
    ordinary content inside the chunk of their enclosing ``#`` section, and
    each chunk is expected to begin with its own ``#`` header line.
    """
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
        ],
        strip_headers=False,
    )
    source_text = (
        "# Foo\n\n"
        " ## Bat\n\n"
        "Hi this is Jim\n\n"
        "Hi Joe\n\n"
        "## Baz\n\n"
        "# Bar\n\n"
        "This is Alice\n\n"
        "This is Bob"
    )

    # One chunk per top-level section, header line included in the content.
    foo_chunk = Document(
        page_content="# Foo \n## Bat \nHi this is Jim \nHi Joe \n## Baz",
        metadata={"Header 1": "Foo"},
    )
    bar_chunk = Document(
        page_content="# Bar \nThis is Alice \nThis is Bob",
        metadata={"Header 1": "Bar"},
    )

    assert splitter.split_text(source_text) == [foo_chunk, bar_chunk]
def test_md_header_text_splitter_preserve_headers_2() -> None:
    """Check preserved headers across three tracked header levels.

    With ``strip_headers=False`` and ``#``/``##``/``###`` all tracked, the
    expected chunks show a header-only ``# Foo`` line folded into the
    following deeper ``## Bar`` chunk, while the trailing ``## Buz`` and
    ``# Bop`` headers (which have no body text) each form their own chunk.
    """
    tracked_headers = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=tracked_headers,
        strip_headers=False,
    )
    source_text = (
        "# Foo\n\n"
        " ## Bar\n\n"
        "Hi this is Jim\n\n"
        "Hi this is Joe\n\n"
        "### Boo \n\n"
        "Hi this is Lance\n\n"
        "## Baz\n\n"
        "Hi this is Molly\n"
        " ## Buz\n"
        "# Bop"
    )

    # (page_content, metadata) pairs in the order the chunks are expected.
    expected_pairs = [
        (
            "# Foo \n## Bar \nHi this is Jim \nHi this is Joe",
            {"Header 1": "Foo", "Header 2": "Bar"},
        ),
        (
            "### Boo \nHi this is Lance",
            {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
        ),
        (
            "## Baz \nHi this is Molly",
            {"Header 1": "Foo", "Header 2": "Baz"},
        ),
        (
            "## Buz",
            {"Header 1": "Foo", "Header 2": "Buz"},
        ),
        ("# Bop", {"Header 1": "Bop"}),
    ]
    expected_chunks = [
        Document(page_content=content, metadata=meta)
        for content, meta in expected_pairs
    ]

    assert splitter.split_text(source_text) == expected_chunks
@pytest.mark.parametrize("fence", [("```"), ("~~~")])
def test_md_header_text_splitter_fenced_code_block(fence: str) -> None:
"""Test markdown splitter by header: Fenced code block."""

Loading…
Cancel
Save