Harrison/markdown splitter (#1169)

Co-authored-by: Michael Chen <flamingdescent@gmail.com>
Co-authored-by: Michael Chen <michaelchen@stripe.com>
This commit is contained in:
Harrison Chase 2023-02-19 21:31:58 -08:00 committed by GitHub
parent 37dd34bea5
commit 28781a6213
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 204 additions and 2 deletions

View File

@ -524,10 +524,162 @@
"print(texts[0])"
]
},
{
"cell_type": "markdown",
"id": "c24dbbb7",
"metadata": {},
"source": [
"# Markdown Text Splitter\n",
"\n",
"MarkdownTextSplitter splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of RecursiveCharacterSplitter with Markdown-specific separators. See the source code to see the Markdown syntax expected by default."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "593e490c",
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import MarkdownTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "89a9a3ea",
"metadata": {},
"outputs": [],
"source": [
"markdown_text = \"\"\"\n",
"# 🦜️🔗 LangChain\n",
"\n",
"⚡ Building applications with LLMs through composability ⚡\n",
"\n",
"## Quick Install\n",
"\n",
"```bash\n",
"# Hopefully this code block isn't split\n",
"pip install langchain\n",
"```\n",
"\n",
"As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
"\"\"\"\n",
"markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "241f0719",
"metadata": {},
"outputs": [],
"source": [
"docs = markdown_splitter.create_documents([markdown_text])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "7789e643",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡', lookup_str='', metadata={}, lookup_index=0),\n",
" Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", lookup_str='', metadata={}, lookup_index=0),\n",
" Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', lookup_str='', metadata={}, lookup_index=0)]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs"
]
},
{
"cell_type": "markdown",
"id": "04a6392a",
"metadata": {},
"source": [
"# Python Code Text Splitter\n",
"\n",
"PythonCodeTextSplitter splits text along python class and method definitions. It's implemented as a simple subclass of RecursiveCharacterSplitter with Python-specific separators. See the source code to see the Python syntax expected by default."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "8fb36bc7",
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import PythonCodeTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d359f3dc",
"metadata": {},
"outputs": [],
"source": [
"python_text = \"\"\"\n",
"class Foo:\n",
"\n",
" def bar():\n",
" \n",
" \n",
"def foo():\n",
"\n",
"def testing_func():\n",
"\n",
"def bar():\n",
"\"\"\"\n",
"python_splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "26b79cd9",
"metadata": {},
"outputs": [],
"source": [
"docs = python_splitter.create_documents([python_text])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "b1749579",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Foo:\\n\\n def bar():', lookup_str='', metadata={}, lookup_index=0),\n",
" Document(page_content='foo():\\n\\ndef testing_func():', lookup_str='', metadata={}, lookup_index=0),\n",
" Document(page_content='bar():', lookup_str='', metadata={}, lookup_index=0)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0905c1de",
"id": "6e6c8cc7",
"metadata": {},
"outputs": [],
"source": []
@ -549,7 +701,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

View File

@ -309,3 +309,53 @@ class SpacyTextSplitter(TextSplitter):
"""Split incoming text and return chunks."""
splits = (str(s) for s in self._tokenizer(text).sents)
return self._merge_splits(splits, self._separator)
class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive character splitter preconfigured with Markdown separators."""

    def __init__(self, **kwargs: Any):
        """Assemble the Markdown separator list and pass it to the parent.

        The separators are listed in priority order: headings first, then the
        end of a fenced code block, then horizontal rules, then the generic
        blank-line / newline / space / empty-string fallbacks.

        Args:
            **kwargs: Forwarded unchanged to the parent class initializer.
        """
        # "#"-style headings, level 2 ("\n## ") through level 6 ("\n###### ").
        # The alternative underline syntax for headings, e.g.
        #     Heading level 2
        #     ---------------
        # is not handled by these separators.
        heading_seps = ["\n" + "#" * depth + " " for depth in range(2, 7)]

        # A closing code fence followed by a blank line.
        code_block_end = ["```\n\n"]

        # Horizontal rules written as exactly three characters on their own
        # line, surrounded by blank lines. Rules made of more than three
        # characters are not matched by these separators.
        rule_seps = ["\n\n" + mark + "\n\n" for mark in ("***", "---", "___")]

        # Generic fallbacks, from coarsest to finest.
        generic_seps = ["\n\n", "\n", " ", ""]

        super().__init__(
            separators=heading_seps + code_block_end + rule_seps + generic_seps,
            **kwargs,
        )
class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along Python syntax."""

    def __init__(self, **kwargs: Any):
        """Initialize a PythonCodeTextSplitter.

        Args:
            **kwargs: Forwarded unchanged to the parent class initializer.
        """
        separators = [
            # First, try to split along class definitions
            "\nclass ",
            # Then along function definitions at the start of a line
            "\ndef ",
            # Then along definitions indented by a single tab.
            # NOTE(review): definitions indented with spaces are not matched
            # by this separator; confirm whether that is intended.
            "\n\tdef ",
            # Now split by the normal type of lines
            "\n\n",
            "\n",
            " ",
            "",
        ]
        super().__init__(separators=separators, **kwargs)