From b023f0c0f244f890c8d9102776b9586eb603cb2a Mon Sep 17 00:00:00 2001 From: Lance Martin <122662504+rlancemartin@users.noreply.github.com> Date: Mon, 12 Jun 2023 15:46:42 -0700 Subject: [PATCH] Text splitter for Markdown files by header (#5860) This creates a new kind of text splitter for markdown files. The user can supply a set of headers that they want to split the file on. We define a new text splitter class, `MarkdownHeaderTextSplitter`, that does a few things: (1) For each line, it determines the associated set of user-specified headers (2) It groups lines with common headers into splits See notebook for example usage and test cases. --- .../examples/markdown_header_metadata.ipynb | 324 ++++++++++++++++++ langchain/text_splitter.py | 150 ++++++++ 2 files changed, 474 insertions(+) create mode 100644 docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb diff --git a/docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb b/docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb new file mode 100644 index 0000000000..db300d6307 --- /dev/null +++ b/docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb @@ -0,0 +1,324 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "70e9b619", + "metadata": {}, + "source": [ + "# MarkdownHeaderTextSplitter\n", + "\n", + "The objective is to split a markdown file by a specified set of headers.\n", + " \n", + "**Given this example:**\n", + "\n", + "# Foo\n", + "\n", + "## Bar\n", + "\n", + "Hi this is Jim \n", + "Hi this is Joe\n", + "\n", + "## Baz\n", + "\n", + "Hi this is Molly\n", + " \n", + "**Written as:**\n", + "\n", + "```\n", + "md = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim \\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n", + "```\n", + "\n", + "**If we want to split on specified headers:**\n", + "```\n", + "[(\"#\", \"Header 1\"),(\"##\", \"Header 2\")]\n", + "```\n", + "\n", + "**Then we expect:** \n", + "```\n", + 
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n", + "```\n", + "\n", + "**Options:**\n", + " \n", + "This also includes `return_each_line` in case a user want to perform other types of aggregation. \n", + "\n", + "If `return_each_line=True`, each line and associated header metadata are returned. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "19c044f0", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.text_splitter import MarkdownHeaderTextSplitter" + ] + }, + { + "cell_type": "markdown", + "id": "ec8d8053", + "metadata": {}, + "source": [ + "`Test case 1`" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5cd0a66c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Hi this is Jim', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" + ] + } + ], + "source": [ + "# Doc\n", + "markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n", + " \n", + "# Test case 1\n", + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + "]\n", + "\n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n", + "\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "67d25a1c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 
'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" + ] + } + ], + "source": [ + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "markdown", + "id": "f1f74dfa", + "metadata": {}, + "source": [ + "`Test case 2`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2183c96a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Text under H3.', 'metadata': {'Header 1': 'H1', 'Header 2': 'H2', 'Header 3': 'H3'}}\n", + "{'content': 'Text under H2_2.', 'metadata': {'Header 1': 'H1_2', 'Header 2': 'H2_2'}}\n" + ] + } + ], + "source": [ + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + " (\"###\", \"Header 3\"),\n", + "]\n", + "markdown_document = '# H1\\n\\n## H2\\n\\n### H3\\n\\nText under H3.\\n\\n# H1_2\\n\\n## H2_2\\n\\nText under H2_2.'\n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "markdown", + "id": "add24254", + "metadata": {}, + "source": [ + "`Test case 3`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c3f4690f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" + ] + } 
+ ], + "source": [ + "markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ### Boo \\n\\n Hi this is Lance \\n\\n ## Baz\\n\\n Hi this is Molly' \n", + " \n", + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + " (\"###\", \"Header 3\"),\n", + "]\n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "20907fb7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Hi this is Jim', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" + ] + } + ], + "source": [ + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "markdown", + "id": "9c448431", + "metadata": {}, + "source": [ + "`Test case 4`" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9858ea51", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n", + "{'content': 'Hi this is John', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo', 
'Header 4': 'Bim'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" + ] + } + ], + "source": [ + "markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ### Boo \\n\\n Hi this is Lance \\n\\n #### Bim \\n\\n Hi this is John \\n\\n ## Baz\\n\\n Hi this is Molly'\n", + " \n", + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + " (\"###\", \"Header 3\"),\n", + " (\"####\", \"Header 4\"),\n", + "]\n", + " \n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "markdown", + "id": "bba6eb9e", + "metadata": {}, + "source": [ + "`Test case 5`" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "8af8f9a2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. 
John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\nMarkdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n", + "{'content': 'As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n", + "{'content': 'From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence', 'Header 4': 'Standardization'}}\n", + "{'content': 'Implementations of Markdown are available for over a dozen programming languages.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Implementations'}}\n" + ] + } + ], + "source": [ + "markdown_document = '# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. 
\\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.'\n", + " \n", + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + " (\"###\", \"Header 3\"),\n", + " (\"####\", \"Header 4\"),\n", + "]\n", + " \n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py index 8955950558..15723b66d0 100644 --- a/langchain/text_splitter.py +++ b/langchain/text_splitter.py @@ -12,12 +12,15 @@ from typing import ( Any, Callable, Collection, + Dict, Iterable, List, Literal, Optional, Sequence, + Tuple, Type, + TypedDict, TypeVar, Union, cast, @@ -254,6 +257,153 @@ class CharacterTextSplitter(TextSplitter): return self._merge_splits(splits, _separator) +class LineType(TypedDict): + metadata: Dict[str, str] + content: str + + +class HeaderType(TypedDict): + level: int + name: str + data: str + + +class MarkdownHeaderTextSplitter: + """Implementation of splitting markdown files based on specified headers.""" + + def __init__( + self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False + ): + """Create a new MarkdownHeaderTextSplitter. 
class LineType(TypedDict):
    """A piece of markdown content together with its governing header metadata."""

    metadata: Dict[str, str]
    content: str


class HeaderType(TypedDict):
    """A markdown header as tracked on the nesting stack."""

    # Number of '#' characters (1 for H1, 2 for H2, ...)
    level: int
    # Metadata key this header is recorded under
    name: str
    # Header text with the '#' marker stripped
    data: str


class MarkdownHeaderTextSplitter:
    """Implementation of splitting markdown files based on specified headers.

    Each returned element pairs a chunk of body text (``content``) with the
    headers (``metadata``) that were in effect where that text appeared in
    the document.
    """

    def __init__(
        self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False
    ):
        """Create a new MarkdownHeaderTextSplitter.

        Args:
            headers_to_split_on: Headers we want to track, as
                ``(marker, metadata_key)`` pairs, e.g. ``("##", "Header 2")``.
            return_each_line: Return each line w/ associated headers instead of
                aggregating consecutive lines that share the same headers.
        """
        # Output line-by-line or aggregated into chunks w/ common headers
        self.return_each_line = return_each_line
        # Given the headers we want to split on (e.g., "#", "##"), order by
        # length, longest first, so "##" is tested before "#" and a line like
        # "## Bar" is never mistaken for an H1 header.
        self.headers_to_split_on = sorted(
            headers_to_split_on, key=lambda split: len(split[0]), reverse=True
        )

    def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[LineType]:
        """Combine lines with common metadata into chunks.

        Args:
            lines: Line of text / associated header metadata
        """
        aggregated_chunks: List[LineType] = []

        for line in lines:
            if (
                aggregated_chunks
                and aggregated_chunks[-1]["metadata"] == line["metadata"]
            ):
                # If the last entry in the aggregated list has the same
                # metadata as the current line, append the current content
                # to the last entry's content
                aggregated_chunks[-1]["content"] += " \n" + line["content"]
            else:
                # Otherwise, start a new chunk from the current line
                aggregated_chunks.append(line)
        return aggregated_chunks

    def split_text(self, text: str) -> List[LineType]:
        """Split a markdown document by the configured headers.

        Args:
            text: Markdown file contents.
        """
        # Split the input text by newline character ("\n").
        lines = text.split("\n")
        # Final output
        lines_with_metadata: List[LineType] = []
        # Content and metadata of the chunk currently being processed
        current_content: List[str] = []
        current_metadata: Dict[str, str] = {}
        # Keep track of the nested header structure
        header_stack: List[HeaderType] = []
        initial_metadata: Dict[str, str] = {}

        for line in lines:
            stripped_line = line.strip()
            # Check each line against each of the header types (e.g., #, ##)
            for sep, name in self.headers_to_split_on:
                # Check if line starts with a header that we intend to split on
                if stripped_line.startswith(sep) and (
                    # Header with no text OR header is followed by space;
                    # both are valid conditions that sep is being used as a
                    # header
                    len(stripped_line) == len(sep)
                    or stripped_line[len(sep)] == " "
                ):
                    # Ensure we are tracking the header as metadata
                    if name is not None:
                        # Get the current header level
                        current_header_level = sep.count("#")

                        # Pop out headers of lower or same level from the stack
                        while (
                            header_stack
                            and header_stack[-1]["level"] >= current_header_level
                        ):
                            # We have encountered a new header
                            # at the same or higher level
                            popped_header = header_stack.pop()
                            # Clear the metadata for the
                            # popped header in initial_metadata
                            if popped_header["name"] in initial_metadata:
                                initial_metadata.pop(popped_header["name"])

                        # Push the current header to the stack
                        header: HeaderType = {
                            "level": current_header_level,
                            "name": name,
                            "data": stripped_line[len(sep) :].strip(),
                        }
                        header_stack.append(header)
                        # Update initial_metadata with the current header
                        initial_metadata[name] = header["data"]

                    # Add the previous line to the lines_with_metadata
                    # only if current_content is not empty
                    if current_content:
                        lines_with_metadata.append(
                            {
                                "content": "\n".join(current_content),
                                "metadata": current_metadata.copy(),
                            }
                        )
                        current_content.clear()

                    break
            else:
                if stripped_line:
                    current_content.append(stripped_line)
                elif current_content:
                    lines_with_metadata.append(
                        {
                            "content": "\n".join(current_content),
                            "metadata": current_metadata.copy(),
                        }
                    )
                    current_content.clear()

            current_metadata = initial_metadata.copy()

        if current_content:
            lines_with_metadata.append(
                {"content": "\n".join(current_content), "metadata": current_metadata}
            )

        # lines_with_metadata has each line with associated header metadata;
        # aggregate these into chunks based on common metadata
        if not self.return_each_line:
            return self.aggregate_lines_to_chunks(lines_with_metadata)
        else:
            return lines_with_metadata