From 28781a6213207bdf0794905b27222035c0d51cbd Mon Sep 17 00:00:00 2001
From: Harrison Chase <hw.chase.17@gmail.com>
Date: Sun, 19 Feb 2023 21:31:58 -0800
Subject: [PATCH] Harrison/markdown splitter (#1169)

Co-authored-by: Michael Chen <flamingdescent@gmail.com>
Co-authored-by: Michael Chen <michaelchen@stripe.com>
---
 .../combine_docs_examples/textsplitter.ipynb  | 156 +++++++++++++++++-
 langchain/text_splitter.py                    |  50 ++++++
 2 files changed, 204 insertions(+), 2 deletions(-)

diff --git a/docs/modules/utils/combine_docs_examples/textsplitter.ipynb b/docs/modules/utils/combine_docs_examples/textsplitter.ipynb
index a5ccb781..c6a86ffc 100644
--- a/docs/modules/utils/combine_docs_examples/textsplitter.ipynb
+++ b/docs/modules/utils/combine_docs_examples/textsplitter.ipynb
@@ -524,10 +524,162 @@
     "print(texts[0])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "c24dbbb7",
+   "metadata": {},
+   "source": [
+    "# Markdown Text Splitter\n",
+    "\n",
+    "MarkdownTextSplitter splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of RecursiveCharacterTextSplitter with Markdown-specific separators. See the source code for the Markdown syntax expected by default."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "593e490c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.text_splitter import MarkdownTextSplitter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "89a9a3ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "markdown_text = \"\"\"\n",
+    "# 🦜️🔗 LangChain\n",
+    "\n",
+    "⚡ Building applications with LLMs through composability ⚡\n",
+    "\n",
+    "## Quick Install\n",
+    "\n",
+    "```bash\n",
+    "# Hopefully this code block isn't split\n",
+    "pip install langchain\n",
+    "```\n",
+    "\n",
+    "As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
+    "\"\"\"\n",
+    "markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "241f0719",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = markdown_splitter.create_documents([markdown_text])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "7789e643",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡', lookup_str='', metadata={}, lookup_index=0),\n",
+       " Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", lookup_str='', metadata={}, lookup_index=0),\n",
+       " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', lookup_str='', metadata={}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "04a6392a",
+   "metadata": {},
+   "source": [
+    "# Python Code Text Splitter\n",
+    "\n",
+    "PythonCodeTextSplitter splits text along Python class and method definitions. It's implemented as a simple subclass of RecursiveCharacterTextSplitter with Python-specific separators. See the source code for the Python syntax expected by default."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "8fb36bc7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.text_splitter import PythonCodeTextSplitter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "d359f3dc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "python_text = \"\"\"\n",
+    "class Foo:\n",
+    "\n",
+    "    def bar():\n",
+    "    \n",
+    "    \n",
+    "def foo():\n",
+    "\n",
+    "def testing_func():\n",
+    "\n",
+    "def bar():\n",
+    "\"\"\"\n",
+    "python_splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "26b79cd9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = python_splitter.create_documents([python_text])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "b1749579",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Foo:\\n\\n    def bar():', lookup_str='', metadata={}, lookup_index=0),\n",
+       " Document(page_content='foo():\\n\\ndef testing_func():', lookup_str='', metadata={}, lookup_index=0),\n",
+       " Document(page_content='bar():', lookup_str='', metadata={}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0905c1de",
+   "id": "6e6c8cc7",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -549,7 +701,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
   },
   "vscode": {
    "interpreter": {
diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py
index eb818436..76f82f14 100644
--- a/langchain/text_splitter.py
+++ b/langchain/text_splitter.py
@@ -309,3 +309,53 @@ class SpacyTextSplitter(TextSplitter):
         """Split incoming text and return chunks."""
         splits = (str(s) for s in self._tokenizer(text).sents)
         return self._merge_splits(splits, self._separator)
+
+
+class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
+    """Attempts to split the text along Markdown-formatted headings."""
+
+    def __init__(self, **kwargs: Any):
+        """Initialize a MarkdownTextSplitter."""
+        separators = [
+            # First, try to split along Markdown headings (starting with level 2)
+            "\n## ",
+            "\n### ",
+            "\n#### ",
+            "\n##### ",
+            "\n###### ",
+            # Note the alternative syntax for headings (below) is not handled here
+            # Heading level 2
+            # ---------------
+            # End of code block
+            "```\n\n",
+            # Horizontal lines
+            "\n\n***\n\n",
+            "\n\n---\n\n",
+            "\n\n___\n\n",
+            # Note that Markdown allows horizontal lines made of *three or more*
+            # of *, -, or _; only the exact three-character forms are split on
+            "\n\n",
+            "\n",
+            " ",
+            "",
+        ]
+        super().__init__(separators=separators, **kwargs)
+
+
+class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
+    """Attempts to split the text along Python syntax."""
+
+    def __init__(self, **kwargs: Any):
+        """Initialize a PythonCodeTextSplitter."""
+        separators = [
+            # First, try to split along class and function definitions
+            "\nclass ",
+            "\ndef ",
+            "\n\tdef ",
+            # Now split by the normal type of lines
+            "\n\n",
+            "\n",
+            " ",
+            "",
+        ]
+        super().__init__(separators=separators, **kwargs)