forked from Archives/langchain
MD header text splitter returns Documents (#6571)
Return `Documents` from MD header text splitter to simplify UX. Updates the test as well as example notebooks.
This commit is contained in:
parent
3436da65a4
commit
30f7288082
@ -50,8 +50,8 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "19c044f0",
|
||||
"execution_count": 2,
|
||||
"id": "ceb3c1fb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -65,13 +65,16 @@
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
|
||||
"{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n",
|
||||
"{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='Hi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n",
|
||||
" Document(page_content='Hi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n",
|
||||
" Document(page_content='Hi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@ -85,8 +88,28 @@
|
||||
"\n",
|
||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
|
||||
"md_header_splits = markdown_splitter.split_text(markdown_document)\n",
|
||||
"for split in md_header_splits:\n",
|
||||
" print(split)"
|
||||
"md_header_splits"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "aac1738c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"langchain.schema.Document"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"type(md_header_splits[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -99,10 +122,25 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 8,
|
||||
"id": "480e0e3a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
|
||||
" Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
|
||||
" Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
|
||||
" Document(page_content='#### Standardization \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
|
||||
" Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"markdown_document = \"# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.\"\n",
|
||||
"\n",
|
||||
@ -117,60 +155,13 @@
|
||||
"\n",
|
||||
"# Char-level splits\n",
|
||||
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
|
||||
"chunk_size = 10\n",
|
||||
"chunk_overlap = 0\n",
|
||||
"chunk_size = 250\n",
|
||||
"chunk_overlap = 30\n",
|
||||
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
|
||||
"\n",
|
||||
"# Split within each header group\n",
|
||||
"all_splits=[]\n",
|
||||
"all_metadatas=[] \n",
|
||||
"for header_group in md_header_splits:\n",
|
||||
" _splits = text_splitter.split_text(header_group['content'])\n",
|
||||
" _metadatas = [header_group['metadata'] for _ in _splits]\n",
|
||||
" all_splits += _splits\n",
|
||||
" all_metadatas += _metadatas"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "3f5d775e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Markdown[9'"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_splits[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "33ab0d5c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'Header 1': 'Intro', 'Header 2': 'History'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_metadatas[0]"
|
||||
"# Split\n",
|
||||
"splits = text_splitter.split_documents(md_header_splits)\n",
|
||||
"splits"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -9,11 +9,11 @@
|
||||
"\n",
|
||||
"Text splitting for vector storage often uses sentences or other delimiters [to keep related text together](https://www.pinecone.io/learn/chunking-strategies/). \n",
|
||||
"\n",
|
||||
"But many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting. \n",
|
||||
"But many documents (such as `Markdown` files) have structure (headers) that can be explicitly used in splitting. \n",
|
||||
"\n",
|
||||
"We added a new text splitter for Markdown files that lets a user split based specified headers. \n",
|
||||
"The `MarkdownHeaderTextSplitter` lets a user split `Markdown` files based on specified headers. \n",
|
||||
"\n",
|
||||
"This results in chunks that retain the header(s) that it came from (e.g., Introduction) in the chunk metadata.\n",
|
||||
"This results in chunks that retain the header(s) they came from in the metadata.\n",
|
||||
"\n",
|
||||
"This works nicely w/ `SelfQueryRetriever`.\n",
|
||||
"\n",
|
||||
@ -30,19 +30,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "cda52c2c",
|
||||
"execution_count": null,
|
||||
"id": "2e587f65",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/31treehaus/miniconda3/envs/langchain-new/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.6.4) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load Notion page as a markdown file\n",
|
||||
"from langchain.document_loaders import NotionDirectoryLoader\n",
|
||||
@ -54,22 +45,10 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "730b84f2",
|
||||
"execution_count": null,
|
||||
"id": "1cd3fd7e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'content': 'We previously introduced [auto-evaluator](https://blog.langchain.dev/auto-evaluator-opportunities/), an open-source tool for grading LLM question-answer chains. Here, we extend auto-evaluator with a [lightweight Streamlit app](https://github.com/langchain-ai/auto-evaluator/tree/main/streamlit) that can connect to any existing Pinecone index. We add the ability to test metadata filtering using `SelfQueryRetriever` as well as some other approaches that we’ve found to be useful, as discussed below. \\n[ret_trim.mov](Auto-Evaluation%20of%20Metadata%20Filtering%2018502448c85240828f33716740f9574b/ret_trim.mov)',\n",
|
||||
" 'metadata': {'Section': 'Evaluation'}}"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Let's create groups based on the section headers in our page\n",
|
||||
"from langchain.text_splitter import MarkdownHeaderTextSplitter\n",
|
||||
@ -77,8 +56,7 @@
|
||||
" (\"###\", \"Section\"),\n",
|
||||
"]\n",
|
||||
"markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
|
||||
"md_header_splits = markdown_splitter.split_text(md_file)\n",
|
||||
"md_header_splits[3]"
|
||||
"md_header_splits = markdown_splitter.split_text(md_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -86,7 +64,7 @@
|
||||
"id": "4f73a609",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now, we split the text in each header group and keep the group as metadata."
|
||||
"Now, perform text splitting on the header-grouped documents. "
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -101,57 +79,7 @@
|
||||
"chunk_size = 500\n",
|
||||
"chunk_overlap = 0\n",
|
||||
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
|
||||
" \n",
|
||||
"# Create splits within each header group and combine them\n",
|
||||
"all_splits=[]\n",
|
||||
"all_metadatas=[]\n",
|
||||
"for header_group in md_header_splits:\n",
|
||||
" _splits = text_splitter.split_text(header_group['content'])\n",
|
||||
" _metadatas = [header_group['metadata'] for _ in _splits]\n",
|
||||
" all_splits += _splits\n",
|
||||
" all_metadatas += _metadatas"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "7424f78b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'In these cases, semantic search will look for the concept `episode 53` in the chunks, but instead we simply want to filter the chunks for `episode 53` and then perform semantic search to extract those that best summarize the episode. Metadata filtering does this, so long as we 1) we have a metadata filter for episode number and 2) we can extract the value from the query (e.g., `54` or `252`) that we want to extract. The LangChain `SelfQueryRetriever` does the latter (see'"
|
||||
]
|
||||
},
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_splits[6]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"id": "08f5db3a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'Section': 'Motivation'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_metadatas[6]"
|
||||
"all_splits = text_splitter.split_documents(md_header_splits)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -183,7 +111,7 @@
|
||||
"source": [
|
||||
"# Build vectorstore and keep the metadata\n",
|
||||
"from langchain.vectorstores import Chroma\n",
|
||||
"vectorstore = Chroma.from_texts(texts=all_splits,metadatas=all_metadatas,embedding=OpenAIEmbeddings())"
|
||||
"vectorstore = Chroma.from_documents(documents=all_splits,embedding=OpenAIEmbeddings())"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -288,7 +288,7 @@ class MarkdownHeaderTextSplitter:
|
||||
headers_to_split_on, key=lambda split: len(split[0]), reverse=True
|
||||
)
|
||||
|
||||
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[LineType]:
|
||||
def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
|
||||
"""Combine lines with common metadata into chunks
|
||||
Args:
|
||||
lines: Line of text / associated header metadata
|
||||
@ -307,9 +307,13 @@ class MarkdownHeaderTextSplitter:
|
||||
else:
|
||||
# Otherwise, append the current line to the aggregated list
|
||||
aggregated_chunks.append(line)
|
||||
return aggregated_chunks
|
||||
|
||||
def split_text(self, text: str) -> List[LineType]:
|
||||
return [
|
||||
Document(page_content=chunk["content"], metadata=chunk["metadata"])
|
||||
for chunk in aggregated_chunks
|
||||
]
|
||||
|
||||
def split_text(self, text: str) -> List[Document]:
|
||||
"""Split markdown file
|
||||
Args:
|
||||
text: Markdown file"""
|
||||
@ -401,7 +405,10 @@ class MarkdownHeaderTextSplitter:
|
||||
if not self.return_each_line:
|
||||
return self.aggregate_lines_to_chunks(lines_with_metadata)
|
||||
else:
|
||||
return lines_with_metadata
|
||||
return [
|
||||
Document(page_content=chunk["content"], metadata=chunk["metadata"])
|
||||
for chunk in lines_with_metadata
|
||||
]
|
||||
|
||||
|
||||
# should be in newer Python versions (3.10+)
|
||||
|
@ -694,14 +694,14 @@ def test_md_header_text_splitter_1() -> None:
|
||||
)
|
||||
output = markdown_splitter.split_text(markdown_document)
|
||||
expected_output = [
|
||||
{
|
||||
"content": "Hi this is Jim \nHi this is Joe",
|
||||
"metadata": {"Header 1": "Foo", "Header 2": "Bar"},
|
||||
},
|
||||
{
|
||||
"content": "Hi this is Molly",
|
||||
"metadata": {"Header 1": "Foo", "Header 2": "Baz"},
|
||||
},
|
||||
Document(
|
||||
page_content="Hi this is Jim \nHi this is Joe",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Bar"},
|
||||
),
|
||||
Document(
|
||||
page_content="Hi this is Molly",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Baz"},
|
||||
),
|
||||
]
|
||||
assert output == expected_output
|
||||
|
||||
@ -729,18 +729,18 @@ def test_md_header_text_splitter_2() -> None:
|
||||
)
|
||||
output = markdown_splitter.split_text(markdown_document)
|
||||
expected_output = [
|
||||
{
|
||||
"content": "Hi this is Jim \nHi this is Joe",
|
||||
"metadata": {"Header 1": "Foo", "Header 2": "Bar"},
|
||||
},
|
||||
{
|
||||
"content": "Hi this is Lance",
|
||||
"metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
|
||||
},
|
||||
{
|
||||
"content": "Hi this is Molly",
|
||||
"metadata": {"Header 1": "Foo", "Header 2": "Baz"},
|
||||
},
|
||||
Document(
|
||||
page_content="Hi this is Jim \nHi this is Joe",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Bar"},
|
||||
),
|
||||
Document(
|
||||
page_content="Hi this is Lance",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
|
||||
),
|
||||
Document(
|
||||
page_content="Hi this is Molly",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Baz"},
|
||||
),
|
||||
]
|
||||
assert output == expected_output
|
||||
|
||||
@ -774,27 +774,27 @@ def test_md_header_text_splitter_3() -> None:
|
||||
output = markdown_splitter.split_text(markdown_document)
|
||||
|
||||
expected_output = [
|
||||
{
|
||||
"content": "Hi this is Jim \nHi this is Joe",
|
||||
"metadata": {"Header 1": "Foo", "Header 2": "Bar"},
|
||||
},
|
||||
{
|
||||
"content": "Hi this is Lance",
|
||||
"metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
|
||||
},
|
||||
{
|
||||
"content": "Hi this is John",
|
||||
"metadata": {
|
||||
Document(
|
||||
page_content="Hi this is Jim \nHi this is Joe",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Bar"},
|
||||
),
|
||||
Document(
|
||||
page_content="Hi this is Lance",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
|
||||
),
|
||||
Document(
|
||||
page_content="Hi this is John",
|
||||
metadata={
|
||||
"Header 1": "Foo",
|
||||
"Header 2": "Bar",
|
||||
"Header 3": "Boo",
|
||||
"Header 4": "Bim",
|
||||
},
|
||||
},
|
||||
{
|
||||
"content": "Hi this is Molly",
|
||||
"metadata": {"Header 1": "Foo", "Header 2": "Baz"},
|
||||
},
|
||||
),
|
||||
Document(
|
||||
page_content="Hi this is Molly",
|
||||
metadata={"Header 1": "Foo", "Header 2": "Baz"},
|
||||
),
|
||||
]
|
||||
|
||||
assert output == expected_output
|
||||
|
Loading…
Reference in New Issue
Block a user