From 5ce74b59586d64dd145465d6499053b997eac0d9 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Wed, 31 May 2023 07:11:53 -0700 Subject: [PATCH] code splitter docs (#5480) Co-authored-by: Dev 2049 --- docs/modules/indexes/text_splitters.rst | 8 +- .../examples/code_splitter.ipynb | 337 +++++++++++++++--- .../text_splitters/examples/html.ipynb | 172 --------- .../text_splitters/examples/latex.ipynb | 155 -------- .../text_splitters/examples/markdown.ipynb | 153 -------- .../text_splitters/examples/python.ipynb | 121 ------- langchain/text_splitter.py | 332 +++++++---------- tests/unit_tests/test_text_splitter.py | 49 ++- 8 files changed, 455 insertions(+), 872 deletions(-) delete mode 100644 docs/modules/indexes/text_splitters/examples/html.ipynb delete mode 100644 docs/modules/indexes/text_splitters/examples/latex.ipynb delete mode 100644 docs/modules/indexes/text_splitters/examples/markdown.ipynb delete mode 100644 docs/modules/indexes/text_splitters/examples/python.ipynb diff --git a/docs/modules/indexes/text_splitters.rst b/docs/modules/indexes/text_splitters.rst index 9b8b66fb..a8e037a3 100644 --- a/docs/modules/indexes/text_splitters.rst +++ b/docs/modules/indexes/text_splitters.rst @@ -33,10 +33,8 @@ For an introduction to the default text splitter and generic functionality see: Usage examples for the text splitters: - `Character <./text_splitters/examples/character_text_splitter.html>`_ -- `LaTeX <./text_splitters/examples/latex.html>`_ -- `Markdown <./text_splitters/examples/markdown.html>`_ +- `Code (including HTML, Markdown, Latex, Python, etc) <./text_splitters/examples/code_splitter.html>`_ - `NLTK <./text_splitters/examples/nltk.html>`_ -- `Python code <./text_splitters/examples/python.html>`_ - `Recursive Character <./text_splitters/examples/recursive_text_splitter.html>`_ - `spaCy <./text_splitters/examples/spacy.html>`_ - `tiktoken (OpenAI) <./text_splitters/examples/tiktoken_splitter.html>`_ @@ -49,10 +47,8 @@ Usage examples for the text 
splitters: :hidden: ./text_splitters/examples/character_text_splitter.ipynb - ./text_splitters/examples/latex.ipynb - ./text_splitters/examples/markdown.ipynb + ./text_splitters/examples/code_splitter.ipynb ./text_splitters/examples/nltk.ipynb - ./text_splitters/examples/python.ipynb ./text_splitters/examples/recursive_text_splitter.ipynb ./text_splitters/examples/spacy.ipynb ./text_splitters/examples/tiktoken_splitter.ipynb diff --git a/docs/modules/indexes/text_splitters/examples/code_splitter.ipynb b/docs/modules/indexes/text_splitters/examples/code_splitter.ipynb index c769dd4a..674159f6 100644 --- a/docs/modules/indexes/text_splitters/examples/code_splitter.ipynb +++ b/docs/modules/indexes/text_splitters/examples/code_splitter.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -12,64 +11,94 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from langchain.text_splitter import (\n", - " CodeTextSplitter,\n", + " RecursiveCharacterTextSplitter,\n", " Language,\n", ")" ] }, { - "attachments": {}, - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 2, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['cpp',\n", + " 'go',\n", + " 'java',\n", + " 'js',\n", + " 'php',\n", + " 'proto',\n", + " 'python',\n", + " 'rst',\n", + " 'ruby',\n", + " 'rust',\n", + " 'scala',\n", + " 'swift',\n", + " 'markdown',\n", + " 'latex',\n", + " 'html']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "## Choose a language to use" + "# Full list of support languages\n", + "[e.value for e in Language]" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['\\nclass ', '\\ndef ', '\\n\\tdef ', '\\n\\n', '\\n', ' ', '']" + ] + }, + "execution_count": 3, + 
"metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "python_splitter = CodeTextSplitter(\n", - " language=Language.PYTHON, chunk_size=16, chunk_overlap=0\n", - ")\n", - "js_splitter = CodeTextSplitter(\n", - " language=Language.JS, chunk_size=16, chunk_overlap=0\n", - ")" + "# You can also see the separators used for a given language\n", + "RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Split the code" + "## Python\n", + "\n", + "Here's an example using the PythonTextSplitter" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='def', metadata={}),\n", - " Document(page_content='hello_world():', metadata={}),\n", - " Document(page_content='print(\"Hello,', metadata={}),\n", - " Document(page_content='World!\")', metadata={}),\n", - " Document(page_content='# Call the', metadata={}),\n", - " Document(page_content='function', metadata={}),\n", - " Document(page_content='hello_world()', metadata={})]" + "[Document(page_content='def hello_world():\\n print(\"Hello, World!\")', metadata={}),\n", + " Document(page_content='# Call the function\\nhello_world()', metadata={})]" ] }, - "execution_count": 8, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -82,31 +111,34 @@ "# Call the function\n", "hello_world()\n", "\"\"\"\n", - "\n", + "python_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " language=Language.PYTHON, chunk_size=50, chunk_overlap=0\n", + ")\n", "python_docs = python_splitter.create_documents([PYTHON_CODE])\n", "python_docs" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## JS\n", + "Here's an example using the JS text splitter" + ] + }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ - "[Document(page_content='function', metadata={}),\n", - " Document(page_content='helloWorld() {', metadata={}),\n", - " Document(page_content='console.log(\"He', metadata={}),\n", - " Document(page_content='llo,', metadata={}),\n", - " Document(page_content='World!\");', metadata={}),\n", - " Document(page_content='}', metadata={}),\n", - " Document(page_content='// Call the', metadata={}),\n", - " Document(page_content='function', metadata={}),\n", - " Document(page_content='helloWorld();', metadata={})]" + "[Document(page_content='function helloWorld() {\\n console.log(\"Hello, World!\");\\n}', metadata={}),\n", + " Document(page_content='// Call the function\\nhelloWorld();', metadata={})]" ] }, - "execution_count": 9, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -121,10 +153,234 @@ "helloWorld();\n", "\"\"\"\n", "\n", + "js_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " language=Language.JS, chunk_size=60, chunk_overlap=0\n", + ")\n", "js_docs = js_splitter.create_documents([JS_CODE])\n", "js_docs" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Markdown\n", + "\n", + "Here's an example using the Markdown text splitter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "markdown_text = \"\"\"\n", + "# šŸ¦œļøšŸ”— LangChain\n", + "\n", + "āš” Building applications with LLMs through composability āš”\n", + "\n", + "## Quick Install\n", + "\n", + "```bash\n", + "# Hopefully this code block isn't split\n", + "pip install langchain\n", + "```\n", + "\n", + "As an open source project in a rapidly developing field, we are extremely open to contributions.\n", + "\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='# šŸ¦œļøšŸ”— LangChain', metadata={}),\n", + " Document(page_content='āš” Building applications with LLMs through composability āš”', metadata={}),\n", + " Document(page_content='## Quick Install', metadata={}),\n", + " Document(page_content=\"```bash\\n# Hopefully this code block isn't split\", metadata={}),\n", + " Document(page_content='pip install langchain', metadata={}),\n", + " Document(page_content='```', metadata={}),\n", + " Document(page_content='As an open source project in a rapidly developing field, we', metadata={}),\n", + " Document(page_content='are extremely open to contributions.', metadata={})]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0\n", + ")\n", + "md_docs = md_splitter.create_documents([markdown_text])\n", + "md_docs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Latex\n", + "\n", + "Here's an example on Latex text" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "latex_text = \"\"\"\n", + "\\documentclass{article}\n", + "\n", + "\\begin{document}\n", + "\n", + "\\maketitle\n", + "\n", + 
"\\section{Introduction}\n", + "Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.\n", + "\n", + "\\subsection{History of LLMs}\n", + "The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.\n", + "\n", + "\\subsection{Applications of LLMs}\n", + "LLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.\n", + "\n", + "\\end{document}\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle', metadata={}),\n", + " Document(page_content='\\\\section{Introduction}', metadata={}),\n", + " Document(page_content='Large language models (LLMs) are a type of machine learning', metadata={}),\n", + " Document(page_content='model that can be trained on vast amounts of text data to', metadata={}),\n", + " Document(page_content='generate human-like language. 
In recent years, LLMs have', metadata={}),\n", + " Document(page_content='made significant advances in a variety of natural language', metadata={}),\n", + " Document(page_content='processing tasks, including language translation, text', metadata={}),\n", + " Document(page_content='generation, and sentiment analysis.', metadata={}),\n", + " Document(page_content='\\\\subsection{History of LLMs}', metadata={}),\n", + " Document(page_content='The earliest LLMs were developed in the 1980s and 1990s,', metadata={}),\n", + " Document(page_content='but they were limited by the amount of data that could be', metadata={}),\n", + " Document(page_content='processed and the computational power available at the', metadata={}),\n", + " Document(page_content='time. In the past decade, however, advances in hardware and', metadata={}),\n", + " Document(page_content='software have made it possible to train LLMs on massive', metadata={}),\n", + " Document(page_content='datasets, leading to significant improvements in', metadata={}),\n", + " Document(page_content='performance.', metadata={}),\n", + " Document(page_content='\\\\subsection{Applications of LLMs}', metadata={}),\n", + " Document(page_content='LLMs have many applications in industry, including', metadata={}),\n", + " Document(page_content='chatbots, content creation, and virtual assistants. 
They', metadata={}),\n", + " Document(page_content='can also be used in academia for research in linguistics,', metadata={}),\n", + " Document(page_content='psychology, and computational linguistics.', metadata={}),\n", + " Document(page_content='\\\\end{document}', metadata={})]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "latex_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " language=Language.LATEX, chunk_size=60, chunk_overlap=0\n", + ")\n", + "latex_docs = latex_splitter.create_documents([latex_text])\n", + "latex_docs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## HTML\n", + "\n", + "Here's an example using an HTML text splitter" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "html_text = \"\"\"\n", + "\n", + "\n", + " \n", + " šŸ¦œļøšŸ”— LangChain\n", + " \n", + " \n", + " \n", + "
\n", + "

šŸ¦œļøšŸ”— LangChain

\n", + "

āš” Building applications with LLMs through composability āš”

\n", + "
\n", + "
\n", + " As an open source project in a rapidly developing field, we are extremely open to contributions.\n", + "
\n", + " \n", + "\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='\\n\\n ', metadata={}),\n", + " Document(page_content='šŸ¦œļøšŸ”— LangChain\\n \\n \\n \\n
', metadata={}),\n", + " Document(page_content='

šŸ¦œļøšŸ”— LangChain

', metadata={}),\n", + " Document(page_content='

āš” Building applications with LLMs through', metadata={}),\n", + " Document(page_content='composability āš”

', metadata={}),\n", + " Document(page_content='
\\n
', metadata={}),\n", + " Document(page_content='As an open source project in a rapidly', metadata={}),\n", + " Document(page_content='developing field, we are extremely open to contributions.', metadata={}),\n", + " Document(page_content='
\\n \\n', metadata={})]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "html_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " language=Language.HTML, chunk_size=60, chunk_overlap=0\n", + ")\n", + "html_docs = html_splitter.create_documents([html_text])\n", + "html_docs" + ] + }, { "cell_type": "code", "execution_count": null, @@ -135,7 +391,7 @@ ], "metadata": { "kernelspec": { - "display_name": "langchain", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -149,9 +405,8 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" - }, - "orig_nbformat": 4 + "version": "3.9.1" + } }, "nbformat": 4, "nbformat_minor": 2 diff --git a/docs/modules/indexes/text_splitters/examples/html.ipynb b/docs/modules/indexes/text_splitters/examples/html.ipynb deleted file mode 100644 index 53905136..00000000 --- a/docs/modules/indexes/text_splitters/examples/html.ipynb +++ /dev/null @@ -1,172 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "80f6cd99", - "metadata": {}, - "source": [ - "# HTML\n", - "\n", - ">[HTML](https://en.wikipedia.org/wiki/HMTL) s the standard markup language for documents designed to be displayed in a web browser.\n", - "\n", - "`HtmlTextSplitter` splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with HTML-specific separators. See the source code to see the HTML syntax expected by default.\n", - "\n", - "1. How the text is split: by list of `HTML` specific separators\n", - "2. 
How the chunk size is measured: by number of characters" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "96d64839", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.text_splitter import HtmlTextSplitter" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "cfb0da17", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "html_text = \"\"\"\n", - "\n", - "\n", - " \n", - " šŸ¦œļøšŸ”— LangChain\n", - " \n", - " \n", - " \n", - "
\n", - "

šŸ¦œļøšŸ”— LangChain

\n", - "

āš” Building applications with LLMs through composability āš”

\n", - "
\n", - "
\n", - " As an open source project in a rapidly developing field, we are extremely open to contributions.\n", - "
\n", - " \n", - "\n", - "\"\"\"\n", - "\n", - "html_splitter = HtmlTextSplitter(chunk_size=175, chunk_overlap=20)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "d59a4fe8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "docs = html_splitter.create_documents([html_text])" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "cbb2e100", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[Document(page_content='\\n', metadata={}),\n", - " Document(page_content='šŸ¦œļøšŸ”— LangChain', metadata={}),\n", - " Document(page_content='body {\\n font-family: Arial, sans-serif;\\n }\\n h1 {\\n color: darkblue;\\n }\\n \\n ', metadata={}),\n", - " Document(page_content='/style>\\n ', metadata={}),\n", - " Document(page_content='
\\n

šŸ¦œļøšŸ”— LangChain

\\n

āš” Building applications with LLMs through composability āš”

\\n
', metadata={}),\n", - " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.\\n \\n \\n', metadata={})]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "91b56e7e-b285-4ca4-a786-149544e0e3c6", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['\\n',\n", - " 'šŸ¦œļøšŸ”— LangChain',\n", - " 'body {\\n font-family: Arial, sans-serif;\\n }\\n h1 {\\n color: darkblue;\\n }\\n \\n ',\n", - " '/style>\\n ',\n", - " '
\\n

šŸ¦œļøšŸ”— LangChain

\\n

āš” Building applications with LLMs through composability āš”

\\n
',\n", - " 'As an open source project in a rapidly developing field, we are extremely open to contributions.\\n \\n \\n']" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "html_splitter.split_text(html_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9bee7858-9175-4d99-bd30-68f2dece8601", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - }, - "vscode": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/modules/indexes/text_splitters/examples/latex.ipynb b/docs/modules/indexes/text_splitters/examples/latex.ipynb deleted file mode 100644 index aaf1f575..00000000 --- a/docs/modules/indexes/text_splitters/examples/latex.ipynb +++ /dev/null @@ -1,155 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "3a2f572e", - "metadata": {}, - "source": [ - "# LaTeX\n", - "\n", - ">[LaTeX](https://en.wikipedia.org/wiki/LaTeX) is widely used in academia for the communication and publication of scientific documents in many fields, including mathematics, computer science, engineering, physics, chemistry, economics, linguistics, quantitative psychology, philosophy, and political science.\n", - "\n", - "`LatexTextSplitter` splits text along `LaTeX` headings, headlines, enumerations and more. It's implemented as a subclass of `RecursiveCharacterSplitter` with LaTeX-specific separators. See the source code for more details.\n", - "\n", - "1. 
How the text is split: by list of `LaTeX` specific tags\n", - "2. How the chunk size is measured: by number of characters" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "c2503917", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.text_splitter import LatexTextSplitter" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e46b753b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "latex_text = \"\"\"\n", - "\\documentclass{article}\n", - "\n", - "\\begin{document}\n", - "\n", - "\\maketitle\n", - "\n", - "\\section{Introduction}\n", - "Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.\n", - "\n", - "\\subsection{History of LLMs}\n", - "The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.\n", - "\n", - "\\subsection{Applications of LLMs}\n", - "LLMs have many applications in industry, including chatbots, content creation, and virtual assistants. 
They can also be used in academia for research in linguistics, psychology, and computational linguistics.\n", - "\n", - "\\end{document}\n", - "\"\"\"\n", - "latex_splitter = LatexTextSplitter(chunk_size=400, chunk_overlap=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "73b5bd33", - "metadata": {}, - "outputs": [], - "source": [ - "docs = latex_splitter.create_documents([latex_text])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e1c7fbd5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Document(page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle', lookup_str='', metadata={}, lookup_index=0),\n", - " Document(page_content='Introduction}\\nLarge language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.', lookup_str='', metadata={}, lookup_index=0),\n", - " Document(page_content='History of LLMs}\\nThe earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.', lookup_str='', metadata={}, lookup_index=0),\n", - " Document(page_content='Applications of LLMs}\\nLLMs have many applications in industry, including chatbots, content creation, and virtual assistants. 
They can also be used in academia for research in linguistics, psychology, and computational linguistics.\\n\\n\\\\end{document}', lookup_str='', metadata={}, lookup_index=0)]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "40e62829-9485-414e-9ea1-e1a8fc7c88cb", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle',\n", - " 'Introduction}\\nLarge language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.',\n", - " 'History of LLMs}\\nThe earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.',\n", - " 'Applications of LLMs}\\nLLMs have many applications in industry, including chatbots, content creation, and virtual assistants. 
They can also be used in academia for research in linguistics, psychology, and computational linguistics.\\n\\n\\\\end{document}']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "latex_splitter.split_text(latex_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7deb8f25-a062-4956-9f90-513802069667", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - }, - "vscode": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/modules/indexes/text_splitters/examples/markdown.ipynb b/docs/modules/indexes/text_splitters/examples/markdown.ipynb deleted file mode 100644 index 1c784e8b..00000000 --- a/docs/modules/indexes/text_splitters/examples/markdown.ipynb +++ /dev/null @@ -1,153 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "80f6cd99", - "metadata": {}, - "source": [ - "# Markdown\n", - "\n", - ">[Markdown](https://en.wikipedia.org/wiki/Markdown) is a lightweight markup language for creating formatted text using a plain-text editor.\n", - "\n", - "`MarkdownTextSplitter` splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with Markdown-specific separators. See the source code to see the Markdown syntax expected by default.\n", - "\n", - "1. How the text is split: by list of `markdown` specific separators\n", - "2. 
How the chunk size is measured: by number of characters" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "96d64839", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.text_splitter import MarkdownTextSplitter" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "cfb0da17", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "markdown_text = \"\"\"\n", - "# šŸ¦œļøšŸ”— LangChain\n", - "\n", - "āš” Building applications with LLMs through composability āš”\n", - "\n", - "## Quick Install\n", - "\n", - "```bash\n", - "# Hopefully this code block isn't split\n", - "pip install langchain\n", - "```\n", - "\n", - "As an open source project in a rapidly developing field, we are extremely open to contributions.\n", - "\"\"\"\n", - "markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "d59a4fe8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "docs = markdown_splitter.create_documents([markdown_text])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "cbb2e100", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[Document(page_content='# šŸ¦œļøšŸ”— LangChain\\n\\nāš” Building applications with LLMs through composability āš”', metadata={}),\n", - " Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", metadata={}),\n", - " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', metadata={})]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "91b56e7e-b285-4ca4-a786-149544e0e3c6", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['# 
šŸ¦œļøšŸ”— LangChain\\n\\nāš” Building applications with LLMs through composability āš”',\n", - " \"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\",\n", - " 'As an open source project in a rapidly developing field, we are extremely open to contributions.']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "markdown_splitter.split_text(markdown_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9bee7858-9175-4d99-bd30-68f2dece8601", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - }, - "vscode": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/modules/indexes/text_splitters/examples/python.ipynb b/docs/modules/indexes/text_splitters/examples/python.ipynb deleted file mode 100644 index ae48b7f1..00000000 --- a/docs/modules/indexes/text_splitters/examples/python.ipynb +++ /dev/null @@ -1,121 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "c350765d", - "metadata": {}, - "source": [ - "# Python Code\n", - "\n", - "`PythonCodeTextSplitter` splits text along python class and method definitions. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with Python-specific separators. See the source code to see the Python syntax expected by default.\n", - "\n", - "1. How the text is split: by list of python specific separators\n", - "2. 
How the chunk size is measured: by number of characters" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1703463f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.text_splitter import PythonCodeTextSplitter" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "f17a1854", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "python_text = \"\"\"\n", - "class Foo:\n", - "\n", - " def bar():\n", - " \n", - " \n", - "def foo():\n", - "\n", - "def testing_func_with_long_name():\n", - "\n", - "def bar():\n", - "\"\"\"\n", - "python_splitter = PythonCodeTextSplitter(chunk_size=40, chunk_overlap=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "8cc33770", - "metadata": {}, - "outputs": [], - "source": [ - "docs = python_splitter.create_documents([python_text])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f5f70775", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Document(page_content='class Foo:\\n\\n def bar():', metadata={}),\n", - " Document(page_content='def foo():', metadata={}),\n", - " Document(page_content='def testing_func_with_long_name():', metadata={}),\n", - " Document(page_content='def bar():', metadata={})]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e096d42", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.1" - }, - "vscode": { - "interpreter": { - "hash": 
"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py index 54ee22b8..a88acf33 100644 --- a/langchain/text_splitter.py +++ b/langchain/text_splitter.py @@ -293,6 +293,24 @@ class TokenTextSplitter(TextSplitter): return splits +class Language(str, Enum): + CPP = "cpp" + GO = "go" + JAVA = "java" + JS = "js" + PHP = "php" + PROTO = "proto" + PYTHON = "python" + RST = "rst" + RUBY = "ruby" + RUST = "rust" + SCALA = "scala" + SWIFT = "swift" + MARKDOWN = "markdown" + LATEX = "latex" + HTML = "html" + + class RecursiveCharacterTextSplitter(TextSplitter): """Implementation of splitting text that looks at characters. @@ -350,205 +368,15 @@ class RecursiveCharacterTextSplitter(TextSplitter): def split_text(self, text: str) -> List[str]: return self._split_text(text, self._separators) - -class NLTKTextSplitter(TextSplitter): - """Implementation of splitting text that looks at sentences using NLTK.""" - - def __init__(self, separator: str = "\n\n", **kwargs: Any): - """Initialize the NLTK splitter.""" - super().__init__(**kwargs) - try: - from nltk.tokenize import sent_tokenize - - self._tokenizer = sent_tokenize - except ImportError: - raise ImportError( - "NLTK is not installed, please install it with `pip install nltk`." - ) - self._separator = separator - - def split_text(self, text: str) -> List[str]: - """Split incoming text and return chunks.""" - # First we naively split the large input into a bunch of smaller ones. 
- splits = self._tokenizer(text) - return self._merge_splits(splits, self._separator) - - -class SpacyTextSplitter(TextSplitter): - """Implementation of splitting text that looks at sentences using Spacy.""" - - def __init__( - self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", **kwargs: Any - ): - """Initialize the spacy text splitter.""" - super().__init__(**kwargs) - try: - import spacy - except ImportError: - raise ImportError( - "Spacy is not installed, please install it with `pip install spacy`." - ) - self._tokenizer = spacy.load(pipeline) - self._separator = separator - - def split_text(self, text: str) -> List[str]: - """Split incoming text and return chunks.""" - splits = (str(s) for s in self._tokenizer(text).sents) - return self._merge_splits(splits, self._separator) - - -class MarkdownTextSplitter(RecursiveCharacterTextSplitter): - """Attempts to split the text along Markdown-formatted headings.""" - - def __init__(self, **kwargs: Any): - """Initialize a MarkdownTextSplitter.""" - separators = [ - # First, try to split along Markdown headings (starting with level 2) - "\n## ", - "\n### ", - "\n#### ", - "\n##### ", - "\n###### ", - # Note the alternative syntax for headings (below) is not handled here - # Heading level 2 - # --------------- - # End of code block - "```\n\n", - # Horizontal lines - "\n\n***\n\n", - "\n\n---\n\n", - "\n\n___\n\n", - # Note that this splitter doesn't handle horizontal lines defined - # by *three or more* of ***, ---, or ___, but this is not handled - "\n\n", - "\n", - " ", - "", - ] - super().__init__(separators=separators, **kwargs) - - -class LatexTextSplitter(RecursiveCharacterTextSplitter): - """Attempts to split the text along Latex-formatted layout elements.""" - - def __init__(self, **kwargs: Any): - """Initialize a LatexTextSplitter.""" - separators = [ - # First, try to split along Latex sections - "\n\\chapter{", - "\n\\section{", - "\n\\subsection{", - "\n\\subsubsection{", - # Now split by 
environments - "\n\\begin{enumerate}", - "\n\\begin{itemize}", - "\n\\begin{description}", - "\n\\begin{list}", - "\n\\begin{quote}", - "\n\\begin{quotation}", - "\n\\begin{verse}", - "\n\\begin{verbatim}", - ## Now split by math environments - "\n\\begin{align}", - "$$", - "$", - # Now split by the normal type of lines - " ", - "", - ] - super().__init__(separators=separators, **kwargs) - - -class PythonCodeTextSplitter(RecursiveCharacterTextSplitter): - """Attempts to split the text along Python syntax.""" - - def __init__(self, **kwargs: Any): - """Initialize a PythonCodeTextSplitter.""" - separators = [ - # First, try to split along class definitions - "\nclass ", - "\ndef ", - "\n\tdef ", - # Now split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - super().__init__(separators=separators, **kwargs) - - -class HtmlTextSplitter(RecursiveCharacterTextSplitter): - """Attempts to split the text along HTML layout elements.""" - - def __init__(self, **kwargs: Any): - """Initialize a HtmlTextSplitter.""" - separators = [ - # First, try to split along HTML tags - "", - "
", - "

", - "
", - "

  • ", - "

    ", - "

    ", - "

    ", - "

    ", - "

    ", - "
    ", - "", - "", - "", - "
    ", - "", - "
      ", - "
        ", - "
        ", - "