Harrison/markdown splitter (#1169)

Co-authored-by: Michael Chen <flamingdescent@gmail.com> Co-authored-by: Michael Chen <michaelchen@stripe.com>
1 year ago · 28781a6213
parent 37dd34bea5
commit 28781a6213
2 changed files with 204 additions and 2 deletions
--- a/docs/modules/utils/combine_docs_examples/textsplitter.ipynb
+++ b/docs/modules/utils/combine_docs_examples/textsplitter.ipynb
@ -524,10 +524,162 @@
    "print(texts[0])"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "c24dbbb7",
+   "metadata": {},
+   "source": [
+    "# Markdown Text Splitter\n",
+    "\n",
+    "MarkdownTextSplitter splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of RecursiveCharacterSplitter with Markdown-specific separators. See the source code to see the Markdown syntax expected by default."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "593e490c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.text_splitter import MarkdownTextSplitter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "89a9a3ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "markdown_text = \"\"\"\n",
+    "# 🦜️🔗 LangChain\n",
+    "\n",
+    "⚡ Building applications with LLMs through composability ⚡\n",
+    "\n",
+    "## Quick Install\n",
+    "\n",
+    "```bash\n",
+    "# Hopefully this code block isn't split\n",
+    "pip install langchain\n",
+    "```\n",
+    "\n",
+    "As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
+    "\"\"\"\n",
+    "markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "241f0719",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = markdown_splitter.create_documents([markdown_text])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "7789e643",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡', lookup_str='', metadata={}, lookup_index=0),\n",
+       " Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", lookup_str='', metadata={}, lookup_index=0),\n",
+       " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', lookup_str='', metadata={}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "04a6392a",
+   "metadata": {},
+   "source": [
+    "# Python Code Text Splitter\n",
+    "\n",
+    "PythonCodeTextSplitter splits text along python class and method definitions. It's implemented as a simple subclass of RecursiveCharacterSplitter with Python-specific separators. See the source code to see the Python syntax expected by default."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "8fb36bc7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.text_splitter import PythonCodeTextSplitter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "d359f3dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "python_text = \"\"\"\n",
+    "class Foo:\n",
+    "\n",
+    "    def bar():\n",
+    "    \n",
+    "    \n",
+    "def foo():\n",
+    "\n",
+    "def testing_func():\n",
+    "\n",
+    "def bar():\n",
+    "\"\"\"\n",
+    "python_splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "26b79cd9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = python_splitter.create_documents([python_text])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "b1749579",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Foo:\\n\\n    def bar():', lookup_str='', metadata={}, lookup_index=0),\n",
+       " Document(page_content='foo():\\n\\ndef testing_func():', lookup_str='', metadata={}, lookup_index=0),\n",
+       " Document(page_content='bar():', lookup_str='', metadata={}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "0905c1de",
+   "id": "6e6c8cc7",
   "metadata": {},
   "outputs": [],
   "source": []
@ -549,7 +701,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  },
  "vscode": {
   "interpreter": {
--- a/langchain/text_splitter.py
+++ b/langchain/text_splitter.py
@ -309,3 +309,53 @@ class SpacyTextSplitter(TextSplitter):
        """Split incoming text and return chunks."""
        splits = (str(s) for s in self._tokenizer(text).sents)
        return self._merge_splits(splits, self._separator)
+
+
+class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
+    """Attempts to split the text along Markdown-formatted headings."""
+
+    def __init__(self, **kwargs: Any):
+        """Initialize a MarkdownTextSplitter."""
+        separators = [
+            # First, try to split along Markdown headings (starting with level 2)
+            "\n## ",
+            "\n### ",
+            "\n#### ",
+            "\n##### ",
+            "\n###### ",
+            # Note the alternative syntax for headings (below) is not handled here
+            # Heading level 2
+            # ---------------
+            # End of code block
+            "```\n\n",
+            # Horizontal lines
+            "\n\n***\n\n",
+            "\n\n---\n\n",
+            "\n\n___\n\n",
+            # Note that this splitter doesn't handle horizontal lines defined
+            # by *three or more* of ***, ---, or ___, but this is not handled
+            "\n\n",
+            "\n",
+            " ",
+            "",
+        ]
+        super().__init__(separators=separators, **kwargs)
+
+
+class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
+    """Attempts to split the text along Python syntax."""
+
+    def __init__(self, **kwargs: Any):
+        """Initialize a MarkdownTextSplitter."""
+        separators = [
+            # First, try to split along class definitions
+            "\nclass ",
+            "\ndef ",
+            "\n\tdef ",
+            # Now split by the normal type of lines
+            "\n\n",
+            "\n",
+            " ",
+            "",
+        ]
+        super().__init__(separators=separators, **kwargs)