mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Harrison/markdown splitter (#1169)
Co-authored-by: Michael Chen <flamingdescent@gmail.com> Co-authored-by: Michael Chen <michaelchen@stripe.com>
This commit is contained in:
parent
37dd34bea5
commit
28781a6213
@ -524,10 +524,162 @@
|
||||
"print(texts[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c24dbbb7",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Markdown Text Splitter\n",
|
||||
"\n",
|
||||
"MarkdownTextSplitter splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of RecursiveCharacterTextSplitter with Markdown-specific separators. See the source code to see the Markdown syntax expected by default."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "593e490c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.text_splitter import MarkdownTextSplitter"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "89a9a3ea",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"markdown_text = \"\"\"\n",
|
||||
"# 🦜️🔗 LangChain\n",
|
||||
"\n",
|
||||
"⚡ Building applications with LLMs through composability ⚡\n",
|
||||
"\n",
|
||||
"## Quick Install\n",
|
||||
"\n",
|
||||
"```bash\n",
|
||||
"# Hopefully this code block isn't split\n",
|
||||
"pip install langchain\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
|
||||
"\"\"\"\n",
|
||||
"markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "241f0719",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = markdown_splitter.create_documents([markdown_text])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "7789e643",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡', lookup_str='', metadata={}, lookup_index=0),\n",
|
||||
" Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", lookup_str='', metadata={}, lookup_index=0),\n",
|
||||
" Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', lookup_str='', metadata={}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "04a6392a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Python Code Text Splitter\n",
|
||||
"\n",
|
||||
"PythonCodeTextSplitter splits text along Python class and method definitions. It's implemented as a simple subclass of RecursiveCharacterTextSplitter with Python-specific separators. See the source code to see the Python syntax expected by default."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "8fb36bc7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.text_splitter import PythonCodeTextSplitter"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "d359f3dc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"python_text = \"\"\"\n",
|
||||
"class Foo:\n",
|
||||
"\n",
|
||||
" def bar():\n",
|
||||
" \n",
|
||||
" \n",
|
||||
"def foo():\n",
|
||||
"\n",
|
||||
"def testing_func():\n",
|
||||
"\n",
|
||||
"def bar():\n",
|
||||
"\"\"\"\n",
|
||||
"python_splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "26b79cd9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"docs = python_splitter.create_documents([python_text])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "b1749579",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='Foo:\\n\\n def bar():', lookup_str='', metadata={}, lookup_index=0),\n",
|
||||
" Document(page_content='foo():\\n\\ndef testing_func():', lookup_str='', metadata={}, lookup_index=0),\n",
|
||||
" Document(page_content='bar():', lookup_str='', metadata={}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0905c1de",
|
||||
"id": "6e6c8cc7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
@ -549,7 +701,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.9.1"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
|
@ -309,3 +309,53 @@ class SpacyTextSplitter(TextSplitter):
|
||||
"""Split incoming text and return chunks."""
|
||||
splits = (str(s) for s in self._tokenizer(text).sents)
|
||||
return self._merge_splits(splits, self._separator)
|
||||
|
||||
|
||||
class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive character splitter preconfigured with Markdown separators.

    The separators are handed to the parent class in this order: headings of
    level 2 through 6, the end of a fenced code block, three-character
    horizontal rules, then blank line, newline, space and the empty string.
    """

    def __init__(self, **kwargs: Any):
        """Build the Markdown separator list and pass it to the parent class.

        Args:
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        # Headings written with leading hashes, level 2 ("\n## ") through
        # level 6 ("\n###### "). The alternative underlined heading syntax,
        # e.g.
        #     Heading level 2
        #     ---------------
        # is not handled here.
        heading_seps = ["\n" + "#" * depth + " " for depth in range(2, 7)]

        # End of a fenced code block.
        code_block_seps = ["```\n\n"]

        # Horizontal rules made of exactly three "*", "-" or "_" characters
        # on their own paragraph. Rules written with more than three such
        # characters are not matched by these separators.
        rule_seps = ["\n\n" + mark * 3 + "\n\n" for mark in "*-_"]

        # Generic fallbacks: blank line, newline, space, empty string.
        generic_seps = ["\n\n", "\n", " ", ""]

        super().__init__(
            separators=heading_seps + code_block_seps + rule_seps + generic_seps,
            **kwargs,
        )
|
||||
|
||||
|
||||
class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive character splitter preconfigured with Python separators."""

    def __init__(self, **kwargs: Any):
        """Initialize a PythonCodeTextSplitter.

        Args:
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        # Definition boundaries, in order: a class at the start of a line, a
        # function at the start of a line, then a tab-indented function.
        # NOTE(review): methods indented with spaces do not match "\n\tdef ".
        definition_seps = ["\nclass ", "\ndef ", "\n\tdef "]

        # Generic fallbacks: blank line, newline, space, empty string.
        fallback_seps = ["\n\n", "\n", " ", ""]

        super().__init__(
            separators=[*definition_seps, *fallback_seps],
            **kwargs,
        )
|
||||
|
Loading…
Reference in New Issue
Block a user