mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Text splitter for Markdown files by header (#5860)
This creates a new kind of text splitter for markdown files. The user can supply a set of headers that they want to split the file on. We define a new text splitter class, `MarkdownHeaderTextSplitter`, that does a few things: (1) For each line, it determines the associated set of user-specified headers (2) It groups lines with common headers into splits See notebook for example usage and test cases.
This commit is contained in:
parent
2c91f0d750
commit
b023f0c0f2
@ -0,0 +1,324 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "70e9b619",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# MarkdownHeaderTextSplitter\n",
|
||||||
|
"\n",
|
||||||
|
"The objective is to split a markdown file by a specified set of headers.\n",
|
||||||
|
" \n",
|
||||||
|
"**Given this example:**\n",
|
||||||
|
"\n",
|
||||||
|
"# Foo\n",
|
||||||
|
"\n",
|
||||||
|
"## Bar\n",
|
||||||
|
"\n",
|
||||||
|
"Hi this is Jim \n",
|
||||||
|
"Hi this is Joe\n",
|
||||||
|
"\n",
|
||||||
|
"## Baz\n",
|
||||||
|
"\n",
|
||||||
|
"Hi this is Molly\n",
|
||||||
|
" \n",
|
||||||
|
"**Written as:**\n",
|
||||||
|
"\n",
|
||||||
|
"```\n",
|
||||||
|
"md = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim \\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"**If we want to split on specified headers:**\n",
|
||||||
|
"```\n",
|
||||||
|
"[(\"#\", \"Header 1\"),(\"##\", \"Header 2\")]\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"**Then we expect:** \n",
|
||||||
|
"```\n",
|
||||||
|
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||||
|
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"**Options:**\n",
|
||||||
|
" \n",
|
||||||
|
    "This also includes `return_each_line` in case a user wants to perform other types of aggregation. \n",
|
||||||
|
"\n",
|
||||||
|
"If `return_each_line=True`, each line and associated header metadata are returned. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "19c044f0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.text_splitter import MarkdownHeaderTextSplitter"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ec8d8053",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"`Test case 1`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "5cd0a66c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'content': 'Hi this is Jim', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||||
|
"{'content': 'Hi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||||
|
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Doc\n",
|
||||||
|
"markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n",
|
||||||
|
" \n",
|
||||||
|
"# Test case 1\n",
|
||||||
|
"headers_to_split_on = [\n",
|
||||||
|
" (\"#\", \"Header 1\"),\n",
|
||||||
|
" (\"##\", \"Header 2\"),\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n",
|
||||||
|
"\n",
|
||||||
|
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
||||||
|
"for chunk in chunked_docs:\n",
|
||||||
|
" print(chunk)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "67d25a1c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||||
|
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n",
|
||||||
|
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
||||||
|
"for chunk in chunked_docs:\n",
|
||||||
|
" print(chunk)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "f1f74dfa",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"`Test case 2`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "2183c96a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'content': 'Text under H3.', 'metadata': {'Header 1': 'H1', 'Header 2': 'H2', 'Header 3': 'H3'}}\n",
|
||||||
|
"{'content': 'Text under H2_2.', 'metadata': {'Header 1': 'H1_2', 'Header 2': 'H2_2'}}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"headers_to_split_on = [\n",
|
||||||
|
" (\"#\", \"Header 1\"),\n",
|
||||||
|
" (\"##\", \"Header 2\"),\n",
|
||||||
|
" (\"###\", \"Header 3\"),\n",
|
||||||
|
"]\n",
|
||||||
|
"markdown_document = '# H1\\n\\n## H2\\n\\n### H3\\n\\nText under H3.\\n\\n# H1_2\\n\\n## H2_2\\n\\nText under H2_2.'\n",
|
||||||
|
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n",
|
||||||
|
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
||||||
|
"for chunk in chunked_docs:\n",
|
||||||
|
" print(chunk)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "add24254",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"`Test case 3`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "c3f4690f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||||
|
"{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n",
|
||||||
|
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ### Boo \\n\\n Hi this is Lance \\n\\n ## Baz\\n\\n Hi this is Molly' \n",
|
||||||
|
" \n",
|
||||||
|
"headers_to_split_on = [\n",
|
||||||
|
" (\"#\", \"Header 1\"),\n",
|
||||||
|
" (\"##\", \"Header 2\"),\n",
|
||||||
|
" (\"###\", \"Header 3\"),\n",
|
||||||
|
"]\n",
|
||||||
|
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n",
|
||||||
|
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
||||||
|
"for chunk in chunked_docs:\n",
|
||||||
|
" print(chunk)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "20907fb7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'content': 'Hi this is Jim', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||||
|
"{'content': 'Hi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||||
|
"{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n",
|
||||||
|
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n",
|
||||||
|
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
||||||
|
"for chunk in chunked_docs:\n",
|
||||||
|
" print(chunk)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "9c448431",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"`Test case 4`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "9858ea51",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||||
|
"{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n",
|
||||||
|
"{'content': 'Hi this is John', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo', 'Header 4': 'Bim'}}\n",
|
||||||
|
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ### Boo \\n\\n Hi this is Lance \\n\\n #### Bim \\n\\n Hi this is John \\n\\n ## Baz\\n\\n Hi this is Molly'\n",
|
||||||
|
" \n",
|
||||||
|
"headers_to_split_on = [\n",
|
||||||
|
" (\"#\", \"Header 1\"),\n",
|
||||||
|
" (\"##\", \"Header 2\"),\n",
|
||||||
|
" (\"###\", \"Header 3\"),\n",
|
||||||
|
" (\"####\", \"Header 4\"),\n",
|
||||||
|
"]\n",
|
||||||
|
" \n",
|
||||||
|
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n",
|
||||||
|
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
||||||
|
"for chunk in chunked_docs:\n",
|
||||||
|
" print(chunk)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "bba6eb9e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"`Test case 5`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "8af8f9a2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'content': 'Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\nMarkdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n",
|
||||||
|
"{'content': 'As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n",
|
||||||
|
"{'content': 'From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence', 'Header 4': 'Standardization'}}\n",
|
||||||
|
"{'content': 'Implementations of Markdown are available for over a dozen programming languages.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Implementations'}}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"markdown_document = '# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.'\n",
|
||||||
|
" \n",
|
||||||
|
"headers_to_split_on = [\n",
|
||||||
|
" (\"#\", \"Header 1\"),\n",
|
||||||
|
" (\"##\", \"Header 2\"),\n",
|
||||||
|
" (\"###\", \"Header 3\"),\n",
|
||||||
|
" (\"####\", \"Header 4\"),\n",
|
||||||
|
"]\n",
|
||||||
|
" \n",
|
||||||
|
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n",
|
||||||
|
"chunked_docs = markdown_splitter.split_text(markdown_document)\n",
|
||||||
|
"for chunk in chunked_docs:\n",
|
||||||
|
" print(chunk)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.16"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -12,12 +12,15 @@ from typing import (
|
|||||||
Any,
|
Any,
|
||||||
Callable,
|
Callable,
|
||||||
Collection,
|
Collection,
|
||||||
|
Dict,
|
||||||
Iterable,
|
Iterable,
|
||||||
List,
|
List,
|
||||||
Literal,
|
Literal,
|
||||||
Optional,
|
Optional,
|
||||||
Sequence,
|
Sequence,
|
||||||
|
Tuple,
|
||||||
Type,
|
Type,
|
||||||
|
TypedDict,
|
||||||
TypeVar,
|
TypeVar,
|
||||||
Union,
|
Union,
|
||||||
cast,
|
cast,
|
||||||
@ -254,6 +257,153 @@ class CharacterTextSplitter(TextSplitter):
|
|||||||
return self._merge_splits(splits, _separator)
|
return self._merge_splits(splits, _separator)
|
||||||
|
|
||||||
|
|
||||||
|
class LineType(TypedDict):
    """A line (or aggregated chunk) of markdown text plus its header metadata."""

    # Mapping of user-supplied header key (e.g. "Header 1") to that header's text.
    metadata: Dict[str, str]
    # The text content of the line/chunk itself.
    content: str
|
||||||
|
|
||||||
|
|
||||||
|
class HeaderType(TypedDict):
    """A markdown header tracked on the stack while walking the document."""

    # Header depth: the number of "#" characters in its prefix (1 for "#", 2 for "##", ...).
    level: int
    # The user-supplied metadata key associated with this header prefix (e.g. "Header 2").
    name: str
    # The header's own text: everything after the "#" prefix, stripped.
    data: str
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownHeaderTextSplitter:
    """Split a markdown document into chunks keyed by the headers above them."""

    def __init__(
        self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False
    ):
        """Create a new MarkdownHeaderTextSplitter.

        Args:
            headers_to_split_on: (header prefix, metadata key) pairs to track,
                e.g. ``("##", "Header 2")``.
            return_each_line: If True, emit one record per line instead of
                aggregating consecutive lines that share header metadata.
        """
        self.return_each_line = return_each_line
        # Longest prefixes first so "##" is tried before "#", etc.
        self.headers_to_split_on = sorted(
            headers_to_split_on, key=lambda split: len(split[0]), reverse=True
        )

    def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[LineType]:
        """Merge consecutive records that carry identical header metadata.

        Args:
            lines: Line of text / associated header metadata
        """
        merged: List[LineType] = []
        for record in lines:
            previous = merged[-1] if merged else None
            if previous is not None and previous["metadata"] == record["metadata"]:
                # Same headers as the chunk being built: extend its content.
                previous["content"] += " \n" + record["content"]
            else:
                # Metadata changed: start a new chunk.
                merged.append(record)
        return merged

    def split_text(self, text: str) -> List[LineType]:
        """Split markdown text on the configured headers.

        Args:
            text: Markdown file contents.
        """
        chunks: List[LineType] = []  # finished (content, metadata) records
        buffer: List[str] = []  # content lines of the chunk in progress
        buffer_metadata: Dict[str, str] = {}  # header metadata for those lines
        active_headers: List[HeaderType] = []  # stack of currently-open headers
        header_metadata: Dict[str, str] = {}  # metadata implied by active_headers

        def flush() -> None:
            # Emit buffered lines (if any) as one record and reset the buffer.
            if buffer:
                chunks.append(
                    {
                        "content": "\n".join(buffer),
                        "metadata": buffer_metadata.copy(),
                    }
                )
                buffer.clear()

        for raw_line in text.split("\n"):
            line = raw_line.strip()

            # Find the first (i.e. longest) header prefix this line starts with.
            # A bare header ("##") or one followed by a space ("## x") both
            # count as a header being used as a header; "##x" does not.
            matched = next(
                (
                    (prefix, label)
                    for prefix, label in self.headers_to_split_on
                    if line.startswith(prefix)
                    and (len(line) == len(prefix) or line[len(prefix)] == " ")
                ),
                None,
            )

            if matched is not None:
                prefix, label = matched
                if label is not None:
                    depth = prefix.count("#")
                    # A new header closes every open header at the same or a
                    # deeper level, so drop those from the stack and metadata.
                    while active_headers and active_headers[-1]["level"] >= depth:
                        closed = active_headers.pop()
                        header_metadata.pop(closed["name"], None)
                    opened: HeaderType = {
                        "level": depth,
                        "name": label,
                        "data": line[len(prefix):].strip(),
                    }
                    active_headers.append(opened)
                    header_metadata[label] = opened["data"]
                # A header always terminates the chunk built so far.
                flush()
            elif line:
                buffer.append(line)
            else:
                # Blank line: close out the current chunk, if any.
                flush()

            # Lines gathered from here on belong to the updated header set.
            buffer_metadata = header_metadata.copy()

        flush()

        if self.return_each_line:
            return chunks
        # Group consecutive records that share the same header metadata.
        return self.aggregate_lines_to_chunks(chunks)
|
||||||
|
|
||||||
|
|
||||||
# should be in newer Python versions (3.10+)
|
# should be in newer Python versions (3.10+)
|
||||||
# @dataclass(frozen=True, kw_only=True, slots=True)
|
# @dataclass(frozen=True, kw_only=True, slots=True)
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
|
Loading…
Reference in New Issue
Block a user