Harrison/markdown splitter (#1169)

Co-authored-by: Michael Chen <flamingdescent@gmail.com> Co-authored-by: Michael Chen <michaelchen@stripe.com>
2024-11-06 03:20:49 +00:00 · 2023-02-19 21:31:58 -08:00 · 2023-02-19 21:31:58 -08:00 · 28781a6213
commit 28781a6213
parent 37dd34bea5
2 changed files with 204 additions and 2 deletions
--- a/docs/modules/utils/combine_docs_examples/textsplitter.ipynb
+++ b/docs/modules/utils/combine_docs_examples/textsplitter.ipynb
@ -524,10 +524,162 @@
    "print(texts[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c24dbbb7",
   "metadata": {},
   "source": [
    "# Markdown Text Splitter\n",
    "\n",
    "MarkdownTextSplitter splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of RecursiveCharacterSplitter with Markdown-specific separators. See the source code to see the Markdown syntax expected by default."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "593e490c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.text_splitter import MarkdownTextSplitter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "89a9a3ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "markdown_text = \"\"\"\n",
    "# 🦜️🔗 LangChain\n",
    "\n",
    "⚡ Building applications with LLMs through composability ⚡\n",
    "\n",
    "## Quick Install\n",
    "\n",
    "```bash\n",
    "# Hopefully this code block isn't split\n",
    "pip install langchain\n",
    "```\n",
    "\n",
    "As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
    "\"\"\"\n",
    "markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "241f0719",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = markdown_splitter.create_documents([markdown_text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7789e643",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡', lookup_str='', metadata={}, lookup_index=0),\n",
       " Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", lookup_str='', metadata={}, lookup_index=0),\n",
       " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', lookup_str='', metadata={}, lookup_index=0)]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "04a6392a",
   "metadata": {},
   "source": [
    "# Python Code Text Splitter\n",
    "\n",
    "PythonCodeTextSplitter splits text along python class and method definitions. It's implemented as a simple subclass of RecursiveCharacterSplitter with Python-specific separators. See the source code to see the Python syntax expected by default."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8fb36bc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.text_splitter import PythonCodeTextSplitter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d359f3dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "python_text = \"\"\"\n",
    "class Foo:\n",
    "\n",
    "    def bar():\n",
    "    \n",
    "    \n",
    "def foo():\n",
    "\n",
    "def testing_func():\n",
    "\n",
    "def bar():\n",
    "\"\"\"\n",
    "python_splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "26b79cd9",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = python_splitter.create_documents([python_text])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b1749579",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='Foo:\\n\\n    def bar():', lookup_str='', metadata={}, lookup_index=0),\n",
       " Document(page_content='foo():\\n\\ndef testing_func():', lookup_str='', metadata={}, lookup_index=0),\n",
       " Document(page_content='bar():', lookup_str='', metadata={}, lookup_index=0)]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "docs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "0905c1de",
+   "id": "6e6c8cc7",
   "metadata": {},
   "outputs": [],
   "source": []
@ -549,7 +701,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  },
  "vscode": {
   "interpreter": {
--- a/langchain/text_splitter.py
+++ b/langchain/text_splitter.py
@ -309,3 +309,53 @@ class SpacyTextSplitter(TextSplitter):
        """Split incoming text and return chunks."""
        splits = (str(s) for s in self._tokenizer(text).sents)
        return self._merge_splits(splits, self._separator)
 class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive character splitter preconfigured with Markdown separators.

    The separator list is, in order: headings of level 2 through 6, the end
    of a fenced code block, three-character horizontal rules, and finally the
    generic blank-line / newline / space / empty-string separators.
    """
    def __init__(self, **kwargs: Any):
        """Build the Markdown separator list and hand it to the parent class.

        Args:
            **kwargs: Forwarded unchanged to RecursiveCharacterTextSplitter.
        """
        # First, try headings written with leading hashes, from level 2
        # ("\n## ") down to level 6 ("\n###### "). Level-1 headings are
        # deliberately not in the list, and the alternative "underlined"
        # heading syntax (a title line followed by a row of dashes) is not
        # handled.
        heading_separators = ["\n" + "#" * level + " " for level in range(2, 7)]
        # Closing fence of a code block followed by a blank line.
        code_block_end = "```\n\n"
        # Horizontal rules made of exactly three "*", "-" or "_" characters,
        # surrounded by blank lines. Rules written with more than three
        # characters are not matched.
        rule_separators = ["\n\n" + mark * 3 + "\n\n" for mark in ("*", "-", "_")]
        # Generic separators: blank line, newline, space, empty string.
        generic_separators = ["\n\n", "\n", " ", ""]
        separators = (
            heading_separators
            + [code_block_end]
            + rule_separators
            + generic_separators
        )
        super().__init__(separators=separators, **kwargs)
 class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along Python syntax."""
    def __init__(self, **kwargs: Any):
        """Initialize a PythonCodeTextSplitter.

        Args:
            **kwargs: Forwarded unchanged to RecursiveCharacterTextSplitter.
        """
        separators = [
            # First, try to split along class definitions
            "\nclass ",
            # Then along function definitions that start at column 0
            "\ndef ",
            # Then along definitions indented with a single tab; definitions
            # indented with spaces are not matched by this separator
            "\n\tdef ",
            # Now split by the normal type of lines
            "\n\n",
            "\n",
            " ",
            "",
        ]
        super().__init__(separators=separators, **kwargs)