forked from Archives/langchain
parent
470b2822a3
commit
5ce74b5958
@ -1,172 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"attachments": {},
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "80f6cd99",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# HTML\n",
|
|
||||||
"\n",
|
|
||||||
">[HTML](https://en.wikipedia.org/wiki/HMTL) s the standard markup language for documents designed to be displayed in a web browser.\n",
|
|
||||||
"\n",
|
|
||||||
"`HtmlTextSplitter` splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with HTML-specific separators. See the source code to see the HTML syntax expected by default.\n",
|
|
||||||
"\n",
|
|
||||||
"1. How the text is split: by list of `HTML` specific separators\n",
|
|
||||||
"2. How the chunk size is measured: by number of characters"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "96d64839",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain.text_splitter import HtmlTextSplitter"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"id": "cfb0da17",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"html_text = \"\"\"\n",
|
|
||||||
"<!DOCTYPE html>\n",
|
|
||||||
"<html>\n",
|
|
||||||
" <head>\n",
|
|
||||||
" <title>🦜️🔗 LangChain</title>\n",
|
|
||||||
" <style>\n",
|
|
||||||
" body {\n",
|
|
||||||
" font-family: Arial, sans-serif;\n",
|
|
||||||
" }\n",
|
|
||||||
" h1 {\n",
|
|
||||||
" color: darkblue;\n",
|
|
||||||
" }\n",
|
|
||||||
" </style>\n",
|
|
||||||
" </head>\n",
|
|
||||||
" <body>\n",
|
|
||||||
" <div>\n",
|
|
||||||
" <h1>🦜️🔗 LangChain</h1>\n",
|
|
||||||
" <p>⚡ Building applications with LLMs through composability ⚡</p>\n",
|
|
||||||
" </div>\n",
|
|
||||||
" <div>\n",
|
|
||||||
" As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
|
|
||||||
" </div>\n",
|
|
||||||
" </body>\n",
|
|
||||||
"</html>\n",
|
|
||||||
"\"\"\"\n",
|
|
||||||
"\n",
|
|
||||||
"html_splitter = HtmlTextSplitter(chunk_size=175, chunk_overlap=20)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 13,
|
|
||||||
"id": "d59a4fe8",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"docs = html_splitter.create_documents([html_text])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 14,
|
|
||||||
"id": "cbb2e100",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[Document(page_content='<!DOCTYPE html>\\n<html>', metadata={}),\n",
|
|
||||||
" Document(page_content='<title>🦜️🔗 LangChain</title>', metadata={}),\n",
|
|
||||||
" Document(page_content='body {\\n font-family: Arial, sans-serif;\\n }\\n h1 {\\n color: darkblue;\\n }\\n </style>\\n </head>', metadata={}),\n",
|
|
||||||
" Document(page_content='/style>\\n </head>', metadata={}),\n",
|
|
||||||
" Document(page_content='<div>\\n <h1>🦜️🔗 LangChain</h1>\\n <p>⚡ Building applications with LLMs through composability ⚡</p>\\n </div>', metadata={}),\n",
|
|
||||||
" Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.\\n </div>\\n </body>\\n</html>', metadata={})]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 14,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"docs"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 15,
|
|
||||||
"id": "91b56e7e-b285-4ca4-a786-149544e0e3c6",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['<!DOCTYPE html>\\n<html>',\n",
|
|
||||||
" '<title>🦜️🔗 LangChain</title>',\n",
|
|
||||||
" 'body {\\n font-family: Arial, sans-serif;\\n }\\n h1 {\\n color: darkblue;\\n }\\n </style>\\n </head>',\n",
|
|
||||||
" '/style>\\n </head>',\n",
|
|
||||||
" '<div>\\n <h1>🦜️🔗 LangChain</h1>\\n <p>⚡ Building applications with LLMs through composability ⚡</p>\\n </div>',\n",
|
|
||||||
" 'As an open source project in a rapidly developing field, we are extremely open to contributions.\\n </div>\\n </body>\\n</html>']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 15,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"html_splitter.split_text(html_text)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "9bee7858-9175-4d99-bd30-68f2dece8601",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.10"
|
|
||||||
},
|
|
||||||
"vscode": {
|
|
||||||
"interpreter": {
|
|
||||||
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
@ -1,155 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "3a2f572e",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# LaTeX\n",
|
|
||||||
"\n",
|
|
||||||
">[LaTeX](https://en.wikipedia.org/wiki/LaTeX) is widely used in academia for the communication and publication of scientific documents in many fields, including mathematics, computer science, engineering, physics, chemistry, economics, linguistics, quantitative psychology, philosophy, and political science.\n",
|
|
||||||
"\n",
|
|
||||||
"`LatexTextSplitter` splits text along `LaTeX` headings, headlines, enumerations and more. It's implemented as a subclass of `RecursiveCharacterSplitter` with LaTeX-specific separators. See the source code for more details.\n",
|
|
||||||
"\n",
|
|
||||||
"1. How the text is split: by list of `LaTeX` specific tags\n",
|
|
||||||
"2. How the chunk size is measured: by number of characters"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"id": "c2503917",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain.text_splitter import LatexTextSplitter"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"id": "e46b753b",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"latex_text = \"\"\"\n",
|
|
||||||
"\\documentclass{article}\n",
|
|
||||||
"\n",
|
|
||||||
"\\begin{document}\n",
|
|
||||||
"\n",
|
|
||||||
"\\maketitle\n",
|
|
||||||
"\n",
|
|
||||||
"\\section{Introduction}\n",
|
|
||||||
"Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.\n",
|
|
||||||
"\n",
|
|
||||||
"\\subsection{History of LLMs}\n",
|
|
||||||
"The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.\n",
|
|
||||||
"\n",
|
|
||||||
"\\subsection{Applications of LLMs}\n",
|
|
||||||
"LLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.\n",
|
|
||||||
"\n",
|
|
||||||
"\\end{document}\n",
|
|
||||||
"\"\"\"\n",
|
|
||||||
"latex_splitter = LatexTextSplitter(chunk_size=400, chunk_overlap=0)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"id": "73b5bd33",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"docs = latex_splitter.create_documents([latex_text])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"id": "e1c7fbd5",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[Document(page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle', lookup_str='', metadata={}, lookup_index=0),\n",
|
|
||||||
" Document(page_content='Introduction}\\nLarge language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.', lookup_str='', metadata={}, lookup_index=0),\n",
|
|
||||||
" Document(page_content='History of LLMs}\\nThe earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.', lookup_str='', metadata={}, lookup_index=0),\n",
|
|
||||||
" Document(page_content='Applications of LLMs}\\nLLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.\\n\\n\\\\end{document}', lookup_str='', metadata={}, lookup_index=0)]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"docs"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"id": "40e62829-9485-414e-9ea1-e1a8fc7c88cb",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle',\n",
|
|
||||||
" 'Introduction}\\nLarge language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.',\n",
|
|
||||||
" 'History of LLMs}\\nThe earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.',\n",
|
|
||||||
" 'Applications of LLMs}\\nLLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.\\n\\n\\\\end{document}']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"latex_splitter.split_text(latex_text)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "7deb8f25-a062-4956-9f90-513802069667",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.6"
|
|
||||||
},
|
|
||||||
"vscode": {
|
|
||||||
"interpreter": {
|
|
||||||
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
@ -1,153 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "80f6cd99",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Markdown\n",
|
|
||||||
"\n",
|
|
||||||
">[Markdown](https://en.wikipedia.org/wiki/Markdown) is a lightweight markup language for creating formatted text using a plain-text editor.\n",
|
|
||||||
"\n",
|
|
||||||
"`MarkdownTextSplitter` splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with Markdown-specific separators. See the source code to see the Markdown syntax expected by default.\n",
|
|
||||||
"\n",
|
|
||||||
"1. How the text is split: by list of `markdown` specific separators\n",
|
|
||||||
"2. How the chunk size is measured: by number of characters"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "96d64839",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain.text_splitter import MarkdownTextSplitter"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"id": "cfb0da17",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"markdown_text = \"\"\"\n",
|
|
||||||
"# 🦜️🔗 LangChain\n",
|
|
||||||
"\n",
|
|
||||||
"⚡ Building applications with LLMs through composability ⚡\n",
|
|
||||||
"\n",
|
|
||||||
"## Quick Install\n",
|
|
||||||
"\n",
|
|
||||||
"```bash\n",
|
|
||||||
"# Hopefully this code block isn't split\n",
|
|
||||||
"pip install langchain\n",
|
|
||||||
"```\n",
|
|
||||||
"\n",
|
|
||||||
"As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
|
|
||||||
"\"\"\"\n",
|
|
||||||
"markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"id": "d59a4fe8",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"docs = markdown_splitter.create_documents([markdown_text])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"id": "cbb2e100",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[Document(page_content='# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡', metadata={}),\n",
|
|
||||||
" Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", metadata={}),\n",
|
|
||||||
" Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', metadata={})]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"docs"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"id": "91b56e7e-b285-4ca4-a786-149544e0e3c6",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['# 🦜️🔗 LangChain\\n\\n⚡ Building applications with LLMs through composability ⚡',\n",
|
|
||||||
" \"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\",\n",
|
|
||||||
" 'As an open source project in a rapidly developing field, we are extremely open to contributions.']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"markdown_splitter.split_text(markdown_text)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "9bee7858-9175-4d99-bd30-68f2dece8601",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.6"
|
|
||||||
},
|
|
||||||
"vscode": {
|
|
||||||
"interpreter": {
|
|
||||||
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
@ -1,121 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "c350765d",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Python Code\n",
|
|
||||||
"\n",
|
|
||||||
"`PythonCodeTextSplitter` splits text along python class and method definitions. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with Python-specific separators. See the source code to see the Python syntax expected by default.\n",
|
|
||||||
"\n",
|
|
||||||
"1. How the text is split: by list of python specific separators\n",
|
|
||||||
"2. How the chunk size is measured: by number of characters"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "1703463f",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from langchain.text_splitter import PythonCodeTextSplitter"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"id": "f17a1854",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"python_text = \"\"\"\n",
|
|
||||||
"class Foo:\n",
|
|
||||||
"\n",
|
|
||||||
" def bar():\n",
|
|
||||||
" \n",
|
|
||||||
" \n",
|
|
||||||
"def foo():\n",
|
|
||||||
"\n",
|
|
||||||
"def testing_func_with_long_name():\n",
|
|
||||||
"\n",
|
|
||||||
"def bar():\n",
|
|
||||||
"\"\"\"\n",
|
|
||||||
"python_splitter = PythonCodeTextSplitter(chunk_size=40, chunk_overlap=0)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"id": "8cc33770",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"docs = python_splitter.create_documents([python_text])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"id": "f5f70775",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[Document(page_content='class Foo:\\n\\n def bar():', metadata={}),\n",
|
|
||||||
" Document(page_content='def foo():', metadata={}),\n",
|
|
||||||
" Document(page_content='def testing_func_with_long_name():', metadata={}),\n",
|
|
||||||
" Document(page_content='def bar():', metadata={})]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"docs"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "6e096d42",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.9.1"
|
|
||||||
},
|
|
||||||
"vscode": {
|
|
||||||
"interpreter": {
|
|
||||||
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
Loading…
Reference in New Issue