diff --git a/docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb b/docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb index db300d63..d706f0f6 100644 --- a/docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb +++ b/docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb @@ -7,48 +7,30 @@ "source": [ "# MarkdownHeaderTextSplitter\n", "\n", - "The objective is to split a markdown file by a specified set of headers.\n", - " \n", - "**Given this example:**\n", - "\n", - "# Foo\n", - "\n", - "## Bar\n", - "\n", - "Hi this is Jim \n", - "Hi this is Joe\n", - "\n", - "## Baz\n", - "\n", - "Hi this is Molly\n", - " \n", - "**Written as:**\n", - "\n", + "This splits a markdown file by a specified set of headers. For example, if we want to split this markdown:\n", "```\n", "md = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim \\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n", "```\n", "\n", - "**If we want to split on specified headers:**\n", + "Headers to split on:\n", "```\n", "[(\"#\", \"Header 1\"),(\"##\", \"Header 2\")]\n", "```\n", "\n", - "**Then we expect:** \n", + "Expected output:\n", "```\n", "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n", "```\n", "\n", - "**Options:**\n", - " \n", - "This also includes `return_each_line` in case a user want to perform other types of aggregation. \n", + "Optionally, this also includes `return_each_line` in case a user wants to perform other types of aggregation. \n", "\n", - "If `return_each_line=True`, each line and associated header metadata are returned. " + "If `return_each_line=True`, each line and its associated header metadata are simply returned. 
" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "19c044f0", "metadata": {}, "outputs": [], @@ -56,117 +38,10 @@ "from langchain.text_splitter import MarkdownHeaderTextSplitter" ] }, - { - "cell_type": "markdown", - "id": "ec8d8053", - "metadata": {}, - "source": [ - "`Test case 1`" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5cd0a66c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'content': 'Hi this is Jim', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", - "{'content': 'Hi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", - "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" - ] - } - ], - "source": [ - "# Doc\n", - "markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n", - " \n", - "# Test case 1\n", - "headers_to_split_on = [\n", - " (\"#\", \"Header 1\"),\n", - " (\"##\", \"Header 2\"),\n", - "]\n", - "\n", - "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n", - "\n", - "chunked_docs = markdown_splitter.split_text(markdown_document)\n", - "for chunk in chunked_docs:\n", - " print(chunk)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "67d25a1c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", - "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" - ] - } - ], - "source": [ - "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", - "chunked_docs = markdown_splitter.split_text(markdown_document)\n", - "for chunk in chunked_docs:\n", - " print(chunk)" - ] - }, - { - "cell_type": "markdown", - "id": "f1f74dfa", - "metadata": {}, - "source": [ - "`Test case 2`" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2183c96a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'content': 'Text under H3.', 'metadata': {'Header 1': 'H1', 'Header 2': 'H2', 'Header 3': 'H3'}}\n", - "{'content': 'Text under H2_2.', 'metadata': {'Header 1': 'H1_2', 'Header 2': 'H2_2'}}\n" - ] - } - ], - "source": [ - "headers_to_split_on = [\n", - " (\"#\", \"Header 1\"),\n", - " (\"##\", \"Header 2\"),\n", - " (\"###\", \"Header 3\"),\n", - "]\n", - "markdown_document = '# H1\\n\\n## H2\\n\\n### H3\\n\\nText under H3.\\n\\n# H1_2\\n\\n## H2_2\\n\\nText under H2_2.'\n", - "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", - "chunked_docs = markdown_splitter.split_text(markdown_document)\n", - "for chunk in chunked_docs:\n", - " print(chunk)" - ] - }, - { - "cell_type": "markdown", - "id": "add24254", - "metadata": {}, - "source": [ - "`Test case 3`" - ] - }, { "cell_type": "code", - "execution_count": 6, - "id": "c3f4690f", + "execution_count": 9, + "id": "2ae3649b", "metadata": {}, "outputs": [ { @@ -187,88 +62,24 @@ " (\"##\", \"Header 2\"),\n", " (\"###\", \"Header 3\"),\n", "]\n", - "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", - "chunked_docs = markdown_splitter.split_text(markdown_document)\n", - "for chunk in chunked_docs:\n", - " 
print(chunk)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "20907fb7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'content': 'Hi this is Jim', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", - "{'content': 'Hi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", - "{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n", - "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" - ] - } - ], - "source": [ - "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n", - "chunked_docs = markdown_splitter.split_text(markdown_document)\n", - "for chunk in chunked_docs:\n", - " print(chunk)" - ] - }, - { - "cell_type": "markdown", - "id": "9c448431", - "metadata": {}, - "source": [ - "`Test case 4`" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "9858ea51", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", - "{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n", - "{'content': 'Hi this is John', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo', 'Header 4': 'Bim'}}\n", - "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" - ] - } - ], - "source": [ - "markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ### Boo \\n\\n Hi this is Lance \\n\\n #### Bim \\n\\n Hi this is John \\n\\n ## Baz\\n\\n Hi this is Molly'\n", - " \n", - "headers_to_split_on = [\n", - " (\"#\", \"Header 1\"),\n", - " (\"##\", \"Header 2\"),\n", - " (\"###\", \"Header 3\"),\n", - " (\"####\", \"Header 4\"),\n", - "]\n", - " \n", - "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", - "chunked_docs = markdown_splitter.split_text(markdown_document)\n", - "for chunk in chunked_docs:\n", - " print(chunk)" + "\n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n", + "splits = markdown_splitter.split_text(markdown_document)\n", + "for split in splits:\n", + " print(split)" ] }, { "cell_type": "markdown", - "id": "bba6eb9e", + "id": "2a32026a", "metadata": {}, "source": [ - "`Test case 5`" + "Here's an example on a larger file with `return_each_line=True` passed, allowing each line to be examined." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "8af8f9a2", "metadata": {}, "outputs": [ @@ -276,8 +87,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'content': 'Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. 
John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\nMarkdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n", - "{'content': 'As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n", + "{'content': 'Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n", + "{'content': 'Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n", + "{'content': 'As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n", + "{'content': 'additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n", "{'content': 'From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence', 'Header 4': 'Standardization'}}\n", "{'content': 'Implementations of Markdown are available for over a dozen programming languages.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Implementations'}}\n" ] @@ -293,11 +106,19 @@ " (\"####\", \"Header 4\"),\n", "]\n", " \n", - "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", - "chunked_docs = markdown_splitter.split_text(markdown_document)\n", - "for chunk in chunked_docs:\n", - " print(chunk)" + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n", + "splits = markdown_splitter.split_text(markdown_document)\n", + "for line in splits:\n", + " print(line)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "987183f2", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/tests/unit_tests/test_text_splitter.py b/tests/unit_tests/test_text_splitter.py index 91730b03..db013579 100644 --- a/tests/unit_tests/test_text_splitter.py +++ b/tests/unit_tests/test_text_splitter.py @@ -7,6 +7,7 @@ from langchain.docstore.document import Document from langchain.text_splitter import ( CharacterTextSplitter, Language, + MarkdownHeaderTextSplitter, PythonCodeTextSplitter, RecursiveCharacterTextSplitter, ) @@ -671,3 +672,129 @@ def test_html_code_splitter() -> None: "
<p>Some text</p>", "<p>Some more text</p>
\n ", ] + + +def test_md_header_text_splitter_1() -> None: + """Test markdown splitter by header: Case 1.""" + + markdown_document = ( + "# Foo\n\n" + " ## Bar\n\n" + "Hi this is Jim\n\n" + "Hi this is Joe\n\n" + " ## Baz\n\n" + " Hi this is Molly" + ) + headers_to_split_on = [ + ("#", "Header 1"), + ("##", "Header 2"), + ] + markdown_splitter = MarkdownHeaderTextSplitter( + headers_to_split_on=headers_to_split_on, + ) + output = markdown_splitter.split_text(markdown_document) + expected_output = [ + { + "content": "Hi this is Jim \nHi this is Joe", + "metadata": {"Header 1": "Foo", "Header 2": "Bar"}, + }, + { + "content": "Hi this is Molly", + "metadata": {"Header 1": "Foo", "Header 2": "Baz"}, + }, + ] + assert output == expected_output + + +def test_md_header_text_splitter_2() -> None: + """Test markdown splitter by header: Case 2.""" + markdown_document = ( + "# Foo\n\n" + " ## Bar\n\n" + "Hi this is Jim\n\n" + "Hi this is Joe\n\n" + " ### Boo \n\n" + " Hi this is Lance \n\n" + " ## Baz\n\n" + " Hi this is Molly" + ) + + headers_to_split_on = [ + ("#", "Header 1"), + ("##", "Header 2"), + ("###", "Header 3"), + ] + markdown_splitter = MarkdownHeaderTextSplitter( + headers_to_split_on=headers_to_split_on, + ) + output = markdown_splitter.split_text(markdown_document) + expected_output = [ + { + "content": "Hi this is Jim \nHi this is Joe", + "metadata": {"Header 1": "Foo", "Header 2": "Bar"}, + }, + { + "content": "Hi this is Lance", + "metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"}, + }, + { + "content": "Hi this is Molly", + "metadata": {"Header 1": "Foo", "Header 2": "Baz"}, + }, + ] + assert output == expected_output + + +def test_md_header_text_splitter_3() -> None: + """Test markdown splitter by header: Case 3.""" + + markdown_document = ( + "# Foo\n\n" + " ## Bar\n\n" + "Hi this is Jim\n\n" + "Hi this is Joe\n\n" + " ### Boo \n\n" + " Hi this is Lance \n\n" + " #### Bim \n\n" + " Hi this is John \n\n" + " ## Baz\n\n" + " Hi this is Molly" + ) + + headers_to_split_on = [ + ("#", "Header 1"), + ("##", "Header 2"), + ("###", "Header 3"), + ("####", "Header 4"), + ] + + markdown_splitter = MarkdownHeaderTextSplitter( + headers_to_split_on=headers_to_split_on, + ) + output = markdown_splitter.split_text(markdown_document) + + expected_output = [ + { + "content": "Hi this is Jim \nHi this is Joe", + "metadata": {"Header 1": "Foo", "Header 2": "Bar"}, + }, + { + "content": "Hi this is Lance", + "metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"}, + }, + { + "content": "Hi this is John", + "metadata": { + "Header 1": "Foo", + "Header 2": "Bar", + "Header 3": "Boo", + "Header 4": "Bim", + }, + }, + { + "content": "Hi this is Molly", + "metadata": {"Header 1": "Foo", "Header 2": "Baz"}, + }, + ] + + assert output == expected_output
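For quick reference, here is a minimal usage sketch of the splitter this diff adds, assuming the import path and behaviour shown in the notebook and tests above (the dict-shaped output mirrors what test_md_header_text_splitter_1 asserts):

from langchain.text_splitter import MarkdownHeaderTextSplitter

# Same document and headers as the first test case above.
markdown_document = "# Foo\n\n ## Bar\n\nHi this is Jim\n\nHi this is Joe\n\n ## Baz\n\n Hi this is Molly"
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
splits = markdown_splitter.split_text(markdown_document)
for split in splits:
    # Each split pairs the chunk text with the headers it sits under, e.g.
    # {'content': 'Hi this is Jim \nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}
    print(split)

# Passing return_each_line=True to the constructor instead returns one dict per line
# rather than one per header group, as the larger notebook example above demonstrates.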