MD header text splitter returns Documents (#6571)

Return `Documents` from MD header text splitter to simplify UX. Updates the test as well as example notebooks.
2023-06-22 09:25:38 -07:00 · 2023-06-22 09:25:38 -07:00 · 30f7288082
commit 30f7288082
parent 3436da65a4
4 changed files with 116 additions and 190 deletions
--- a/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb
+++ b/docs/extras/modules/data_connection/document_transformers/text_splitters/markdown_header_metadata.ipynb
@ -50,8 +50,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
-   "id": "19c044f0",
+   "execution_count": 2,
+   "id": "ceb3c1fb",
   "metadata": {},
   "outputs": [],
   "source": [
@ -65,13 +65,16 @@
   "metadata": {},
   "outputs": [
    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "{'content': 'Hi this is Jim  \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n",
-      "{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n",
-      "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n"
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Hi this is Jim  \\nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}),\n",
+       " Document(page_content='Hi this is Lance', metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}),\n",
+       " Document(page_content='Hi this is Molly', metadata={'Header 1': 'Foo', 'Header 2': 'Baz'})]"
      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
    }
   ],
   "source": [
@ -85,8 +88,28 @@
    "\n",
    "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
    "md_header_splits = markdown_splitter.split_text(markdown_document)\n",
-    "for split in md_header_splits:\n",
-    "    print(split)"
+    "md_header_splits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "aac1738c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "langchain.schema.Document"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type(md_header_splits[0])"
   ]
  },
  {
@ -99,10 +122,25 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
   "id": "480e0e3a",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9]', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
+       " Document(page_content='Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', metadata={'Header 1': 'Intro', 'Header 2': 'History'}),\n",
+       " Document(page_content='As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for  \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.  \\n#### Standardization', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
+       " Document(page_content='#### Standardization  \\nFrom 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', metadata={'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}),\n",
+       " Document(page_content='Implementations of Markdown are available for over a dozen programming languages.', metadata={'Header 1': 'Intro', 'Header 2': 'Implementations'})]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "markdown_document = \"# Intro \\n\\n    ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. \\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.\"\n",
    "\n",
@ -117,60 +155,13 @@
    "\n",
    "# Char-level splits\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
-    "chunk_size = 10\n",
-    "chunk_overlap = 0\n",
+    "chunk_size = 250\n",
+    "chunk_overlap = 30\n",
    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
    "\n",
-    "# Split within each header group\n",
-    "all_splits=[]\n",
-    "all_metadatas=[]    \n",
-    "for header_group in md_header_splits:\n",
-    "    _splits = text_splitter.split_text(header_group['content'])\n",
-    "    _metadatas = [header_group['metadata'] for _ in _splits]\n",
-    "    all_splits += _splits\n",
-    "    all_metadatas += _metadatas"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "3f5d775e",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Markdown[9'"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_splits[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "33ab0d5c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'Header 1': 'Intro', 'Header 2': 'History'}"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_metadatas[0]"
+    "# Split\n",
+    "splits = text_splitter.split_documents(md_header_splits)\n",
+    "splits"
   ]
  }
 ],
--- a/docs/extras/use_cases/question_answering/document-context-aware-QA.ipynb
+++ b/docs/extras/use_cases/question_answering/document-context-aware-QA.ipynb
@ -9,11 +9,11 @@
    "\n",
    "Text splitting for vector storage often uses sentences or other delimiters [to keep related text together](https://www.pinecone.io/learn/chunking-strategies/). \n",
    "\n",
-    "But many documents (such as Markdown) have structure (headers) that can be explicitly used in splitting. \n",
+    "But many documents (such as `Markdown` files) have structure (headers) that can be explicitly used in splitting. \n",
    "\n",
-    "We added a new text splitter for Markdown files that lets a user split based specified headers. \n",
+    "The `MarkdownHeaderTextSplitter` lets a user split `Markdown` files based on specified headers. \n",
    "\n",
-    "This results in chunks that retain the header(s) that it came from (e.g., Introduction) in the chunk metadata.\n",
+    "This results in chunks that retain the header(s) they came from in their metadata.\n",
    "\n",
    "This works nicely w/ `SelfQueryRetriever`.\n",
    "\n",
@ -30,19 +30,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
-   "id": "cda52c2c",
+   "execution_count": null,
+   "id": "2e587f65",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/31treehaus/miniconda3/envs/langchain-new/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.6.4) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
-      "  warnings.warn(\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Load Notion page as a markdownfile file\n",
    "from langchain.document_loaders import NotionDirectoryLoader\n",
@ -54,22 +45,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
-   "id": "730b84f2",
+   "execution_count": null,
+   "id": "1cd3fd7e",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'content': 'We previously introduced [auto-evaluator](https://blog.langchain.dev/auto-evaluator-opportunities/), an open-source tool for grading LLM question-answer chains. Here, we extend auto-evaluator with a [lightweight Streamlit app](https://github.com/langchain-ai/auto-evaluator/tree/main/streamlit) that can connect to any existing Pinecone index. We add the ability to test metadata filtering using `SelfQueryRetriever` as well as some other approaches that we’ve found to be useful, as discussed below.  \\n[ret_trim.mov](Auto-Evaluation%20of%20Metadata%20Filtering%2018502448c85240828f33716740f9574b/ret_trim.mov)',\n",
-       " 'metadata': {'Section': 'Evaluation'}}"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Let's create groups based on the section headers in our page\n",
    "from langchain.text_splitter import MarkdownHeaderTextSplitter\n",
@ -77,8 +56,7 @@
    "    (\"###\", \"Section\"),\n",
    "]\n",
    "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
-    "md_header_splits = markdown_splitter.split_text(md_file)\n",
-    "md_header_splits[3]"
+    "md_header_splits = markdown_splitter.split_text(md_file)"
   ]
  },
  {
@ -86,7 +64,7 @@
   "id": "4f73a609",
   "metadata": {},
   "source": [
-    "Now, we split the text in each header group and keep the group as metadata."
+    "Now, perform text splitting on the header-grouped documents. "
   ]
  },
  {
@ -101,57 +79,7 @@
    "chunk_size = 500\n",
    "chunk_overlap = 0\n",
    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
-    " \n",
-    "# Create splits within each header group and combine them\n",
-    "all_splits=[]\n",
-    "all_metadatas=[]\n",
-    "for header_group in md_header_splits:\n",
-    "    _splits = text_splitter.split_text(header_group['content'])\n",
-    "    _metadatas = [header_group['metadata'] for _ in _splits]\n",
-    "    all_splits += _splits\n",
-    "    all_metadatas += _metadatas"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "id": "7424f78b",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'In these cases, semantic search will look for the concept `episode 53` in the chunks, but instead we simply want to filter the chunks for `episode 53` and then perform semantic search to extract those that best summarize the episode. Metadata filtering does this, so long as we 1) we have a metadata filter for episode number and 2) we can extract the value from the query (e.g., `54` or `252`) that we want to extract. The LangChain `SelfQueryRetriever` does the latter (see'"
-      ]
-     },
-     "execution_count": 25,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_splits[6]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "08f5db3a",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'Section': 'Motivation'}"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "all_metadatas[6]"
+    "all_splits = text_splitter.split_documents(md_header_splits)"
   ]
  },
  {
@ -183,7 +111,7 @@
   "source": [
    "# Build vectorstore and keep the metadata\n",
    "from langchain.vectorstores import Chroma\n",
-    "vectorstore = Chroma.from_texts(texts=all_splits,metadatas=all_metadatas,embedding=OpenAIEmbeddings())"
+    "vectorstore = Chroma.from_documents(documents=all_splits,embedding=OpenAIEmbeddings())"
   ]
  },
  {
--- a/langchain/text_splitter.py
+++ b/langchain/text_splitter.py
@ -288,7 +288,7 @@ class MarkdownHeaderTextSplitter:
            headers_to_split_on, key=lambda split: len(split[0]), reverse=True
        )

-    def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[LineType]:
+    def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]:
        """Combine lines with common metadata into chunks
        Args:
            lines: Line of text / associated header metadata
@ -307,9 +307,13 @@ class MarkdownHeaderTextSplitter:
            else:
                # Otherwise, append the current line to the aggregated list
                aggregated_chunks.append(line)
-        return aggregated_chunks

-    def split_text(self, text: str) -> List[LineType]:
+        return [
+            Document(page_content=chunk["content"], metadata=chunk["metadata"])
+            for chunk in aggregated_chunks
+        ]
+
+    def split_text(self, text: str) -> List[Document]:
        """Split markdown file
        Args:
            text: Markdown file"""
@ -401,7 +405,10 @@ class MarkdownHeaderTextSplitter:
        if not self.return_each_line:
            return self.aggregate_lines_to_chunks(lines_with_metadata)
        else:
-            return lines_with_metadata
+            return [
+                Document(page_content=chunk["content"], metadata=chunk["metadata"])
+                for chunk in lines_with_metadata
+            ]


 # should be in newer Python versions (3.10+)
--- a/tests/unit_tests/test_text_splitter.py
+++ b/tests/unit_tests/test_text_splitter.py
@ -694,14 +694,14 @@ def test_md_header_text_splitter_1() -> None:
    )
    output = markdown_splitter.split_text(markdown_document)
    expected_output = [
-        {
-            "content": "Hi this is Jim  \nHi this is Joe",
-            "metadata": {"Header 1": "Foo", "Header 2": "Bar"},
-        },
-        {
-            "content": "Hi this is Molly",
-            "metadata": {"Header 1": "Foo", "Header 2": "Baz"},
-        },
+        Document(
+            page_content="Hi this is Jim  \nHi this is Joe",
+            metadata={"Header 1": "Foo", "Header 2": "Bar"},
+        ),
+        Document(
+            page_content="Hi this is Molly",
+            metadata={"Header 1": "Foo", "Header 2": "Baz"},
+        ),
    ]
    assert output == expected_output

@ -729,18 +729,18 @@ def test_md_header_text_splitter_2() -> None:
    )
    output = markdown_splitter.split_text(markdown_document)
    expected_output = [
-        {
-            "content": "Hi this is Jim  \nHi this is Joe",
-            "metadata": {"Header 1": "Foo", "Header 2": "Bar"},
-        },
-        {
-            "content": "Hi this is Lance",
-            "metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
-        },
-        {
-            "content": "Hi this is Molly",
-            "metadata": {"Header 1": "Foo", "Header 2": "Baz"},
-        },
+        Document(
+            page_content="Hi this is Jim  \nHi this is Joe",
+            metadata={"Header 1": "Foo", "Header 2": "Bar"},
+        ),
+        Document(
+            page_content="Hi this is Lance",
+            metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
+        ),
+        Document(
+            page_content="Hi this is Molly",
+            metadata={"Header 1": "Foo", "Header 2": "Baz"},
+        ),
    ]
    assert output == expected_output

@ -774,27 +774,27 @@ def test_md_header_text_splitter_3() -> None:
    output = markdown_splitter.split_text(markdown_document)

    expected_output = [
-        {
-            "content": "Hi this is Jim  \nHi this is Joe",
-            "metadata": {"Header 1": "Foo", "Header 2": "Bar"},
-        },
-        {
-            "content": "Hi this is Lance",
-            "metadata": {"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
-        },
-        {
-            "content": "Hi this is John",
-            "metadata": {
+        Document(
+            page_content="Hi this is Jim  \nHi this is Joe",
+            metadata={"Header 1": "Foo", "Header 2": "Bar"},
+        ),
+        Document(
+            page_content="Hi this is Lance",
+            metadata={"Header 1": "Foo", "Header 2": "Bar", "Header 3": "Boo"},
+        ),
+        Document(
+            page_content="Hi this is John",
+            metadata={
                "Header 1": "Foo",
                "Header 2": "Bar",
                "Header 3": "Boo",
                "Header 4": "Bim",
            },
-        },
-        {
-            "content": "Hi this is Molly",
-            "metadata": {"Header 1": "Foo", "Header 2": "Baz"},
-        },
+        ),
+        Document(
+            page_content="Hi this is Molly",
+            metadata={"Header 1": "Foo", "Header 2": "Baz"},
+        ),
    ]

    assert output == expected_output