From 28781a6213207bdf0794905b27222035c0d51cbd Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Sun, 19 Feb 2023 21:31:58 -0800 Subject: [PATCH] Harrison/markdown splitter (#1169) Co-authored-by: Michael Chen Co-authored-by: Michael Chen --- .../combine_docs_examples/textsplitter.ipynb | 156 +++++++++++++++++- langchain/text_splitter.py | 50 ++++++ 2 files changed, 204 insertions(+), 2 deletions(-) diff --git a/docs/modules/utils/combine_docs_examples/textsplitter.ipynb b/docs/modules/utils/combine_docs_examples/textsplitter.ipynb index a5ccb781..c6a86ffc 100644 --- a/docs/modules/utils/combine_docs_examples/textsplitter.ipynb +++ b/docs/modules/utils/combine_docs_examples/textsplitter.ipynb @@ -524,10 +524,162 @@ "print(texts[0])" ] }, + { + "cell_type": "markdown", + "id": "c24dbbb7", + "metadata": {}, + "source": [ + "# Markdown Text Splitter\n", + "\n", + "MarkdownTextSplitter splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of RecursiveCharacterTextSplitter with Markdown-specific separators. See the source code to see the Markdown syntax expected by default." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "593e490c", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.text_splitter import MarkdownTextSplitter" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "89a9a3ea", + "metadata": {}, + "outputs": [], + "source": [ + "markdown_text = \"\"\"\n", + "# 🦜️🔗 LangChain\n", + "\n", + "⚡ Building applications with LLMs through composability ⚡\n", + "\n", + "## Quick Install\n", + "\n", + "```bash\n", + "# Hopefully this code block isn't split\n", + "pip install langchain\n", + "```\n", + "\n", + "As an open source project in a rapidly developing field, we are extremely open to contributions.\n", + "\"\"\"\n", + "markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "241f0719", + "metadata": {}, + "outputs": [], + "source": [ + "docs = markdown_splitter.create_documents([markdown_text])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7789e643", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡', lookup_str='', metadata={}, lookup_index=0),\n", + " Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", lookup_str='', metadata={}, lookup_index=0),\n", + " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', lookup_str='', metadata={}, lookup_index=0)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs" + ] + }, + { + "cell_type": "markdown", + "id": "04a6392a", + "metadata": {}, + "source": [ + "# Python Code Text Splitter\n", + "\n", + "PythonCodeTextSplitter splits text along Python class and method definitions. 
It's implemented as a simple subclass of RecursiveCharacterTextSplitter with Python-specific separators. See the source code to see the Python syntax expected by default." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8fb36bc7", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.text_splitter import PythonCodeTextSplitter" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d359f3dc", + "metadata": {}, + "outputs": [], + "source": [ + "python_text = \"\"\"\n", + "class Foo:\n", + "\n", + " def bar():\n", + " \n", + " \n", + "def foo():\n", + "\n", + "def testing_func():\n", + "\n", + "def bar():\n", + "\"\"\"\n", + "python_splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "26b79cd9", + "metadata": {}, + "outputs": [], + "source": [ + "docs = python_splitter.create_documents([python_text])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b1749579", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Foo:\\n\\n def bar():', lookup_str='', metadata={}, lookup_index=0),\n", + " Document(page_content='foo():\\n\\ndef testing_func():', lookup_str='', metadata={}, lookup_index=0),\n", + " Document(page_content='bar():', lookup_str='', metadata={}, lookup_index=0)]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "0905c1de", + "id": "6e6c8cc7", "metadata": {}, "outputs": [], "source": [] @@ -549,7 +701,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.1" }, "vscode": { "interpreter": { diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py index eb818436..76f82f14 100644 --- a/langchain/text_splitter.py +++ b/langchain/text_splitter.py @@ -309,3 +309,53 @@ 
class SpacyTextSplitter(TextSplitter): """Split incoming text and return chunks.""" splits = (str(s) for s in self._tokenizer(text).sents) return self._merge_splits(splits, self._separator) + + +class MarkdownTextSplitter(RecursiveCharacterTextSplitter): + """Attempts to split the text along Markdown-formatted headings.""" + + def __init__(self, **kwargs: Any): + """Initialize a MarkdownTextSplitter.""" + separators = [ + # First, try to split along Markdown headings (starting with level 2) + "\n## ", + "\n### ", + "\n#### ", + "\n##### ", + "\n###### ", + # Note the alternative syntax for headings (below) is not handled here + # Heading level 2 + # --------------- + # End of code block + "```\n\n", + # Horizontal lines + "\n\n***\n\n", + "\n\n---\n\n", + "\n\n___\n\n", + # Note that Markdown also allows horizontal lines of *more than three* + # *, -, or _ characters, but only the three-character forms are handled + "\n\n", + "\n", + " ", + "", + ] + super().__init__(separators=separators, **kwargs) + + +class PythonCodeTextSplitter(RecursiveCharacterTextSplitter): + """Attempts to split the text along Python syntax.""" + + def __init__(self, **kwargs: Any): + """Initialize a PythonCodeTextSplitter.""" + separators = [ + # First, try to split along class and function definitions + "\nclass ", + "\ndef ", + "\n\tdef ", + # Now split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + super().__init__(separators=separators, **kwargs)