mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Harrison/markdown splitter (#1169)
Co-authored-by: Michael Chen <flamingdescent@gmail.com> Co-authored-by: Michael Chen <michaelchen@stripe.com>
This commit is contained in:
parent
37dd34bea5
commit
28781a6213
@ -524,10 +524,162 @@
|
|||||||
"print(texts[0])"
|
"print(texts[0])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "c24dbbb7",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Markdown Text Splitter\n",
|
||||||
|
"\n",
|
||||||
|
"MarkdownTextSplitter splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of RecursiveCharacterSplitter with Markdown-specific separators. See the source code to see the Markdown syntax expected by default."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "593e490c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.text_splitter import MarkdownTextSplitter"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "89a9a3ea",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"markdown_text = \"\"\"\n",
|
||||||
|
"# 🦜️🔗 LangChain\n",
|
||||||
|
"\n",
|
||||||
|
"⚡ Building applications with LLMs through composability ⚡\n",
|
||||||
|
"\n",
|
||||||
|
"## Quick Install\n",
|
||||||
|
"\n",
|
||||||
|
"```bash\n",
|
||||||
|
"# Hopefully this code block isn't split\n",
|
||||||
|
"pip install langchain\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "241f0719",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = markdown_splitter.create_documents([markdown_text])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "7789e643",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡', lookup_str='', metadata={}, lookup_index=0),\n",
|
||||||
|
" Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", lookup_str='', metadata={}, lookup_index=0),\n",
|
||||||
|
" Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', lookup_str='', metadata={}, lookup_index=0)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"docs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "04a6392a",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Python Code Text Splitter\n",
|
||||||
|
"\n",
|
||||||
|
"PythonCodeTextSplitter splits text along python class and method definitions. It's implemented as a simple subclass of RecursiveCharacterSplitter with Python-specific separators. See the source code to see the Python syntax expected by default."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "8fb36bc7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.text_splitter import PythonCodeTextSplitter"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "d359f3dc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"python_text = \"\"\"\n",
|
||||||
|
"class Foo:\n",
|
||||||
|
"\n",
|
||||||
|
" def bar():\n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
"def foo():\n",
|
||||||
|
"\n",
|
||||||
|
"def testing_func():\n",
|
||||||
|
"\n",
|
||||||
|
"def bar():\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"python_splitter = PythonCodeTextSplitter(chunk_size=30, chunk_overlap=0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "26b79cd9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = python_splitter.create_documents([python_text])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "b1749579",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Foo:\\n\\n def bar():', lookup_str='', metadata={}, lookup_index=0),\n",
|
||||||
|
" Document(page_content='foo():\\n\\ndef testing_func():', lookup_str='', metadata={}, lookup_index=0),\n",
|
||||||
|
" Document(page_content='bar():', lookup_str='', metadata={}, lookup_index=0)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"docs"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "0905c1de",
|
"id": "6e6c8cc7",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": []
|
||||||
@ -549,7 +701,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.9"
|
"version": "3.9.1"
|
||||||
},
|
},
|
||||||
"vscode": {
|
"vscode": {
|
||||||
"interpreter": {
|
"interpreter": {
|
||||||
|
@ -309,3 +309,53 @@ class SpacyTextSplitter(TextSplitter):
|
|||||||
"""Split incoming text and return chunks."""
|
"""Split incoming text and return chunks."""
|
||||||
splits = (str(s) for s in self._tokenizer(text).sents)
|
splits = (str(s) for s in self._tokenizer(text).sents)
|
||||||
return self._merge_splits(splits, self._separator)
|
return self._merge_splits(splits, self._separator)
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
|
||||||
|
"""Attempts to split the text along Markdown-formatted headings."""
|
||||||
|
|
||||||
|
def __init__(self, **kwargs: Any):
|
||||||
|
"""Initialize a MarkdownTextSplitter."""
|
||||||
|
separators = [
|
||||||
|
# First, try to split along Markdown headings (starting with level 2)
|
||||||
|
"\n## ",
|
||||||
|
"\n### ",
|
||||||
|
"\n#### ",
|
||||||
|
"\n##### ",
|
||||||
|
"\n###### ",
|
||||||
|
# Note the alternative syntax for headings (below) is not handled here
|
||||||
|
# Heading level 2
|
||||||
|
# ---------------
|
||||||
|
# End of code block
|
||||||
|
"```\n\n",
|
||||||
|
# Horizontal lines
|
||||||
|
"\n\n***\n\n",
|
||||||
|
"\n\n---\n\n",
|
||||||
|
"\n\n___\n\n",
|
||||||
|
# Note that this splitter doesn't handle horizontal lines defined
|
||||||
|
# by *more than three* of *, -, or _; only the three-character forms above match
|
||||||
|
"\n\n",
|
||||||
|
"\n",
|
||||||
|
" ",
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
super().__init__(separators=separators, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
|
||||||
|
"""Attempts to split the text along Python syntax."""
|
||||||
|
|
||||||
|
def __init__(self, **kwargs: Any):
|
||||||
|
"""Initialize a MarkdownTextSplitter."""
|
||||||
|
separators = [
|
||||||
|
# First, try to split along class definitions
|
||||||
|
"\nclass ",
|
||||||
|
"\ndef ",
|
||||||
|
"\n\tdef ",
|
||||||
|
# Now split by the normal type of lines
|
||||||
|
"\n\n",
|
||||||
|
"\n",
|
||||||
|
" ",
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
super().__init__(separators=separators, **kwargs)
|
||||||
|
Loading…
Reference in New Issue
Block a user