MD header text splitter returns Documents (#6571)

Return `Documents` from MD header text splitter to simplify UX.

Updates the test as well as example notebooks.
This commit is contained in:
Lance Martin 2023-06-22 09:25:38 -07:00 committed by GitHub
parent 3436da65a4
commit 30f7288082
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 116 additions and 190 deletions

View File

@ -50,8 +50,8 @@
},
{
"cell_type": "code",
"execution_count": 4,
"id": "19c044f0",
"execution_count": 2,
"id": "ceb3c1fb",
"metadata": {},
"outputs": [],
"source": [
@ -65,13 +65,16 @@
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
"{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n",
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
]
"data": {
"text/plain": [
"[Document(page_content='Hi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n",
" Document(page_content='Hi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n",
" Document(page_content='Hi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
@ -85,8 +88,28 @@
"\n",
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
"for split in md_header_splits:\n",
" print(split)"
"md_header_splits"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "aac1738c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"langchain.schema.Document"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(md_header_splits[0])"
]
},
{
@ -99,10 +122,25 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"id": "480e0e3a",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
" Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
" Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
" Document(page_content='#### Standardization \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
" Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"markdown_document = \"# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.\"\n",
"\n",
@ -117,60 +155,13 @@
"\n",
"# Char-level splits\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"chunk_size = 10\n",
"chunk_overlap = 0\n",
"chunk_size = 250\n",
"chunk_overlap = 30\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
"\n",
"# Split within each header group\n",
"all_splits=[]\n",
"all_metadatas=[] \n",
"for header_group in md_header_splits:\n",
" _splits = text_splitter.split_text(header_group['content'])\n",
" _metadatas = [header_group['metadata'] for _ in _splits]\n",
" all_splits += _splits\n",
" all_metadatas += _metadatas"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3f5d775e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Markdown[9'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_splits[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "33ab0d5c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Header 1': 'Intro', 'Header 2': 'History'}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_metadatas[0]"
"# Split\n",
"splits = text_splitter.split_documents(md_header_splits)\n",
"splits"
]
}
],

View File

@ -9,11 +9,11 @@
"\n",
"Text splitting for vector storage often uses sentences or other delimiters [to keep related text together](https://www.pinecone.io/learn/chunking-strategies/). \n",
"\n",
"But many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting. \n",
"But many documents (such as `Markdown` files) have structure (headers) that can be explicitly used in splitting. \n",
"\n",
"We added a new text splitter for Markdown files that lets a user split based specified headers. \n",
"The `MarkdownHeaderTextSplitter` lets a user split `Markdown` files files based on specified headers. \n",
"\n",
"This results in chunks that retain the header(s) that it came from (e.g., Introduction) in the chunk metadata.\n",
"This results in chunks that retain the header(s) that it came from in the metadata.\n",
"\n",
"This works nicely w/ `SelfQueryRetriever`.\n",
"\n",
@ -30,19 +30,10 @@
},
{
"cell_type": "code",
"execution_count": 1,
"id": "cda52c2c",
"execution_count": null,
"id": "2e587f65",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/31treehaus/miniconda3/envs/langchain-new/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.6.4) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
" warnings.warn(\n"
]
}
],
"outputs": [],
"source": [
"# Load Notion page as a markdownfile file\n",
"from langchain.document_loaders import NotionDirectoryLoader\n",
@ -54,22 +45,10 @@
},
{
"cell_type": "code",
"execution_count": 2,
"id": "730b84f2",
"execution_count": null,
"id": "1cd3fd7e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'content': 'We previously introduced [auto-evaluator](https://blog.langchain.dev/auto-evaluator-opportunities/), an open-source tool for grading LLM question-answer chains. Here, we extend auto-evaluator with a [lightweight Streamlit app](https://github.com/langchain-ai/auto-evaluator/tree/main/streamlit) that can connect to any existing Pinecone index. We add the ability to test metadata filtering using `SelfQueryRetriever` as well as some other approaches that weve found to be useful, as discussed below. \\n[ret_trim.mov](Auto-Evaluation%20of%20Metadata%20Filtering%2018502448c85240828f33716740f9574b/ret_trim.mov)',\n",
" 'metadata': {'Section': 'Evaluation'}}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Let's create groups based on the section headers in our page\n",
"from langchain.text_splitter import MarkdownHeaderTextSplitter\n",
@ -77,8 +56,7 @@
" (\"###\", \"Section\"),\n",
"]\n",
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
"md_header_splits = markdown_splitter.split_text(md_file)\n",
"md_header_splits[3]"
"md_header_splits = markdown_splitter.split_text(md_file)"
]
},
{
@ -86,7 +64,7 @@
"id": "4f73a609",
"metadata": {},
"source": [
"Now, we split the text in each header group and keep the group as metadata."
"Now, perform text splitting on the header grouped documents. "
]
},
{
@ -101,57 +79,7 @@
"chunk_size = 500\n",
"chunk_overlap = 0\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
" \n",
"# Create splits within each header group and combine them\n",
"all_splits=[]\n",
"all_metadatas=[]\n",
"for header_group in md_header_splits:\n",
" _splits = text_splitter.split_text(header_group['content'])\n",
" _metadatas = [header_group['metadata'] for _ in _splits]\n",
" all_splits += _splits\n",
" all_metadatas += _metadatas"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "7424f78b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'In these cases, semantic search will look for the concept `episode 53` in the chunks, but instead we simply want to filter the chunks for `episode 53` and then perform semantic search to extract those that best summarize the episode. Metadata filtering does this, so long as we 1) we have a metadata filter for episode number and 2) we can extract the value from the query (e.g., `54` or `252`) that we want to extract. The LangChain `SelfQueryRetriever` does the latter (see'"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_splits[6]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "08f5db3a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Section': 'Motivation'}"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_metadatas[6]"
"all_splits = text_splitter.split_documents(md_header_splits)"
]
},
{
@ -183,7 +111,7 @@
"source": [
"# Build vectorstore and keep the metadata\n",
"from langchain.vectorstores import Chroma\n",
"vectorstore = Chroma.from_texts(texts=all_splits,metadatas=all_metadatas,embedding=OpenAIEmbeddings())"
"vectorstore = Chroma.from_documents(texts=all_splits,metadatas=all_metadatas,embedding=OpenAIEmbeddings())"
]
},
{

View File

@ -288,7 +288,7 @@ class MarkdownHeaderTextSplitter:
headers_to_split_on, key=lambda split: len(split[0]), reverse=True
)
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[LineType]:
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
"""Combine lines with common metadata into chunks
Args:
lines: Line of text / associated header metadata
@ -307,9 +307,13 @@ class MarkdownHeaderTextSplitter:
else:
# Otherwise, append the current line to the aggregated list
aggregated_chunks.append(line)
return aggregated_chunks
def split_text(self, text: str) -> List[LineType]:
return [
Document(page_content=chunk["content"], metadata=chunk["metadata"])
for chunk in aggregated_chunks
]
def split_text(self, text: str) -> List[Document]:
"""Split markdown file
Args:
text: Markdown file"""
@ -401,7 +405,10 @@ class MarkdownHeaderTextSplitter:
if not self.return_each_line:
return self.aggregate_lines_to_chunks(lines_with_metadata)
else:
return lines_with_metadata
return [
Document(page_content=chunk["content"], metadata=chunk["metadata"])
for chunk in lines_with_metadata
]
# should be in newer Python versions (3.10+)

View File

@ -694,14 +694,14 @@ def test_md_header_text_splitter_1() -> None:
)
output = markdown_splitter.split_text(markdown_document)
expected_output = [
{
"content": "Hi this is Jim \nHi this is Joe",
"metadata": {"Header 1": "Foo", "Header 2": "Bar"},
},
{
"content": "Hi this is Molly",
"metadata": {"Header 1": "Foo", "Header 2": "Baz"},
},
Document(
page_content="Hi this is Jim \nHi this is Joe",
metadata={"Header 1": "Foo", "Header 2": "Bar"},
),
Document(
page_content="Hi this is Molly",
metadata={"Header 1": "Foo", "Header 2": "Baz"},
),
]
assert output == expected_output
@ -729,18 +729,18 @@ def test_md_header_text_splitter_2() -> None:
)
output = markdown_splitter.split_text(markdown_document)
expected_output = [
{
"content": "Hi this is Jim \nHi this is Joe",
"metadata": {"Header 1": "Foo", "Header 2": "Bar"},
},
{
"content": "Hi this is Lance",
"metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
},
{
"content": "Hi this is Molly",
"metadata": {"Header 1": "Foo", "Header 2": "Baz"},
},
Document(
page_content="Hi this is Jim \nHi this is Joe",
metadata={"Header 1": "Foo", "Header 2": "Bar"},
),
Document(
page_content="Hi this is Lance",
metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
),
Document(
page_content="Hi this is Molly",
metadata={"Header 1": "Foo", "Header 2": "Baz"},
),
]
assert output == expected_output
@ -774,27 +774,27 @@ def test_md_header_text_splitter_3() -> None:
output = markdown_splitter.split_text(markdown_document)
expected_output = [
{
"content": "Hi this is Jim \nHi this is Joe",
"metadata": {"Header 1": "Foo", "Header 2": "Bar"},
},
{
"content": "Hi this is Lance",
"metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
},
{
"content": "Hi this is John",
"metadata": {
Document(
page_content="Hi this is Jim \nHi this is Joe",
metadata={"Header 1": "Foo", "Header 2": "Bar"},
),
Document(
page_content="Hi this is Lance",
metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
),
Document(
page_content="Hi this is John",
metadata={
"Header 1": "Foo",
"Header 2": "Bar",
"Header 3": "Boo",
"Header 4": "Bim",
},
},
{
"content": "Hi this is Molly",
"metadata": {"Header 1": "Foo", "Header 2": "Baz"},
},
),
Document(
page_content="Hi this is Molly",
metadata={"Header 1": "Foo", "Header 2": "Baz"},
),
]
assert output == expected_output