From 5ce74b59586d64dd145465d6499053b997eac0d9 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Wed, 31 May 2023 07:11:53 -0700 Subject: [PATCH] code splitter docs (#5480) Co-authored-by: Dev 2049 --- docs/modules/indexes/text_splitters.rst | 8 +- .../examples/code_splitter.ipynb | 337 +++++++++++++++--- .../text_splitters/examples/html.ipynb | 172 --------- .../text_splitters/examples/latex.ipynb | 155 -------- .../text_splitters/examples/markdown.ipynb | 153 -------- .../text_splitters/examples/python.ipynb | 121 ------- langchain/text_splitter.py | 332 +++++++---------- tests/unit_tests/test_text_splitter.py | 49 ++- 8 files changed, 455 insertions(+), 872 deletions(-) delete mode 100644 docs/modules/indexes/text_splitters/examples/html.ipynb delete mode 100644 docs/modules/indexes/text_splitters/examples/latex.ipynb delete mode 100644 docs/modules/indexes/text_splitters/examples/markdown.ipynb delete mode 100644 docs/modules/indexes/text_splitters/examples/python.ipynb diff --git a/docs/modules/indexes/text_splitters.rst b/docs/modules/indexes/text_splitters.rst index 9b8b66fb..a8e037a3 100644 --- a/docs/modules/indexes/text_splitters.rst +++ b/docs/modules/indexes/text_splitters.rst @@ -33,10 +33,8 @@ For an introduction to the default text splitter and generic functionality see: Usage examples for the text splitters: - `Character <./text_splitters/examples/character_text_splitter.html>`_ -- `LaTeX <./text_splitters/examples/latex.html>`_ -- `Markdown <./text_splitters/examples/markdown.html>`_ +- `Code (including HTML, Markdown, Latex, Python, etc) <./text_splitters/examples/code_splitter.html>`_ - `NLTK <./text_splitters/examples/nltk.html>`_ -- `Python code <./text_splitters/examples/python.html>`_ - `Recursive Character <./text_splitters/examples/recursive_text_splitter.html>`_ - `spaCy <./text_splitters/examples/spacy.html>`_ - `tiktoken (OpenAI) <./text_splitters/examples/tiktoken_splitter.html>`_ @@ -49,10 +47,8 @@ Usage examples for the text 
splitters: :hidden: ./text_splitters/examples/character_text_splitter.ipynb - ./text_splitters/examples/latex.ipynb - ./text_splitters/examples/markdown.ipynb + ./text_splitters/examples/code_splitter.ipynb ./text_splitters/examples/nltk.ipynb - ./text_splitters/examples/python.ipynb ./text_splitters/examples/recursive_text_splitter.ipynb ./text_splitters/examples/spacy.ipynb ./text_splitters/examples/tiktoken_splitter.ipynb diff --git a/docs/modules/indexes/text_splitters/examples/code_splitter.ipynb b/docs/modules/indexes/text_splitters/examples/code_splitter.ipynb index c769dd4a..674159f6 100644 --- a/docs/modules/indexes/text_splitters/examples/code_splitter.ipynb +++ b/docs/modules/indexes/text_splitters/examples/code_splitter.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -12,64 +11,94 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from langchain.text_splitter import (\n", - " CodeTextSplitter,\n", + " RecursiveCharacterTextSplitter,\n", " Language,\n", ")" ] }, { - "attachments": {}, - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 2, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['cpp',\n", + " 'go',\n", + " 'java',\n", + " 'js',\n", + " 'php',\n", + " 'proto',\n", + " 'python',\n", + " 'rst',\n", + " 'ruby',\n", + " 'rust',\n", + " 'scala',\n", + " 'swift',\n", + " 'markdown',\n", + " 'latex',\n", + " 'html']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "## Choose a language to use" + "# Full list of support languages\n", + "[e.value for e in Language]" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['\\nclass ', '\\ndef ', '\\n\\tdef ', '\\n\\n', '\\n', ' ', '']" + ] + }, + "execution_count": 3, + 
"metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "python_splitter = CodeTextSplitter(\n", - " language=Language.PYTHON, chunk_size=16, chunk_overlap=0\n", - ")\n", - "js_splitter = CodeTextSplitter(\n", - " language=Language.JS, chunk_size=16, chunk_overlap=0\n", - ")" + "# You can also see the separators used for a given language\n", + "RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Split the code" + "## Python\n", + "\n", + "Here's an example using the PythonTextSplitter" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='def', metadata={}),\n", - " Document(page_content='hello_world():', metadata={}),\n", - " Document(page_content='print(\"Hello,', metadata={}),\n", - " Document(page_content='World!\")', metadata={}),\n", - " Document(page_content='# Call the', metadata={}),\n", - " Document(page_content='function', metadata={}),\n", - " Document(page_content='hello_world()', metadata={})]" + "[Document(page_content='def hello_world():\\n print(\"Hello, World!\")', metadata={}),\n", + " Document(page_content='# Call the function\\nhello_world()', metadata={})]" ] }, - "execution_count": 8, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -82,31 +111,34 @@ "# Call the function\n", "hello_world()\n", "\"\"\"\n", - "\n", + "python_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " language=Language.PYTHON, chunk_size=50, chunk_overlap=0\n", + ")\n", "python_docs = python_splitter.create_documents([PYTHON_CODE])\n", "python_docs" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## JS\n", + "Here's an example using the JS text splitter" + ] + }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ - "[Document(page_content='function', metadata={}),\n", - " Document(page_content='helloWorld() {', metadata={}),\n", - " Document(page_content='console.log(\"He', metadata={}),\n", - " Document(page_content='llo,', metadata={}),\n", - " Document(page_content='World!\");', metadata={}),\n", - " Document(page_content='}', metadata={}),\n", - " Document(page_content='// Call the', metadata={}),\n", - " Document(page_content='function', metadata={}),\n", - " Document(page_content='helloWorld();', metadata={})]" + "[Document(page_content='function helloWorld() {\\n console.log(\"Hello, World!\");\\n}', metadata={}),\n", + " Document(page_content='// Call the function\\nhelloWorld();', metadata={})]" ] }, - "execution_count": 9, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -121,10 +153,234 @@ "helloWorld();\n", "\"\"\"\n", "\n", + "js_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " language=Language.JS, chunk_size=60, chunk_overlap=0\n", + ")\n", "js_docs = js_splitter.create_documents([JS_CODE])\n", "js_docs" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Markdown\n", + "\n", + "Here's an example using the Markdown text splitter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "markdown_text = \"\"\"\n", + "# šŸ¦œļøšŸ”— LangChain\n", + "\n", + "āš” Building applications with LLMs through composability āš”\n", + "\n", + "## Quick Install\n", + "\n", + "```bash\n", + "# Hopefully this code block isn't split\n", + "pip install langchain\n", + "```\n", + "\n", + "As an open source project in a rapidly developing field, we are extremely open to contributions.\n", + "\"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='# šŸ¦œļøšŸ”— LangChain', metadata={}),\n", + " Document(page_content='āš” Building applications with LLMs through composability āš”', metadata={}),\n", + " Document(page_content='## Quick Install', metadata={}),\n", + " Document(page_content=\"```bash\\n# Hopefully this code block isn't split\", metadata={}),\n", + " Document(page_content='pip install langchain', metadata={}),\n", + " Document(page_content='```', metadata={}),\n", + " Document(page_content='As an open source project in a rapidly developing field, we', metadata={}),\n", + " Document(page_content='are extremely open to contributions.', metadata={})]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0\n", + ")\n", + "md_docs = md_splitter.create_documents([markdown_text])\n", + "md_docs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Latex\n", + "\n", + "Here's an example on Latex text" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "latex_text = \"\"\"\n", + "\\documentclass{article}\n", + "\n", + "\\begin{document}\n", + "\n", + "\\maketitle\n", + "\n", + 
"\\section{Introduction}\n", + "Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.\n", + "\n", + "\\subsection{History of LLMs}\n", + "The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.\n", + "\n", + "\\subsection{Applications of LLMs}\n", + "LLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.\n", + "\n", + "\\end{document}\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle', metadata={}),\n", + " Document(page_content='\\\\section{Introduction}', metadata={}),\n", + " Document(page_content='Large language models (LLMs) are a type of machine learning', metadata={}),\n", + " Document(page_content='model that can be trained on vast amounts of text data to', metadata={}),\n", + " Document(page_content='generate human-like language. 
In recent years, LLMs have', metadata={}),\n", + " Document(page_content='made significant advances in a variety of natural language', metadata={}),\n", + " Document(page_content='processing tasks, including language translation, text', metadata={}),\n", + " Document(page_content='generation, and sentiment analysis.', metadata={}),\n", + " Document(page_content='\\\\subsection{History of LLMs}', metadata={}),\n", + " Document(page_content='The earliest LLMs were developed in the 1980s and 1990s,', metadata={}),\n", + " Document(page_content='but they were limited by the amount of data that could be', metadata={}),\n", + " Document(page_content='processed and the computational power available at the', metadata={}),\n", + " Document(page_content='time. In the past decade, however, advances in hardware and', metadata={}),\n", + " Document(page_content='software have made it possible to train LLMs on massive', metadata={}),\n", + " Document(page_content='datasets, leading to significant improvements in', metadata={}),\n", + " Document(page_content='performance.', metadata={}),\n", + " Document(page_content='\\\\subsection{Applications of LLMs}', metadata={}),\n", + " Document(page_content='LLMs have many applications in industry, including', metadata={}),\n", + " Document(page_content='chatbots, content creation, and virtual assistants. 
They', metadata={}),\n", + " Document(page_content='can also be used in academia for research in linguistics,', metadata={}),\n", + " Document(page_content='psychology, and computational linguistics.', metadata={}),\n", + " Document(page_content='\\\\end{document}', metadata={})]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "latex_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " language=Language.LATEX, chunk_size=60, chunk_overlap=0\n", + ")\n", + "latex_docs = latex_splitter.create_documents([latex_text])\n", + "latex_docs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## HTML\n", + "\n", + "Here's an example using an HTML text splitter" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "html_text = \"\"\"\n", + "\n", + "\n", + " \n", + " šŸ¦œļøšŸ”— LangChain\n", + " \n", + " \n", + " \n", + "
\n", + "

šŸ¦œļøšŸ”— LangChain

\n", + "

āš” Building applications with LLMs through composability āš”

\n", + "
\n", + "
\n", + " As an open source project in a rapidly developing field, we are extremely open to contributions.\n", + "
\n", + " \n", + "\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='\\n\\n ', metadata={}),\n", + " Document(page_content='šŸ¦œļøšŸ”— LangChain\\n \\n \\n \\n
', metadata={}),\n", + " Document(page_content='

šŸ¦œļøšŸ”— LangChain

', metadata={}),\n", + " Document(page_content='

āš” Building applications with LLMs through', metadata={}),\n", + " Document(page_content='composability āš”

', metadata={}),\n", + " Document(page_content='
\\n
', metadata={}),\n", + " Document(page_content='As an open source project in a rapidly', metadata={}),\n", + " Document(page_content='developing field, we are extremely open to contributions.', metadata={}),\n", + " Document(page_content='
\\n \\n', metadata={})]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "html_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " language=Language.HTML, chunk_size=60, chunk_overlap=0\n", + ")\n", + "html_docs = html_splitter.create_documents([html_text])\n", + "html_docs" + ] + }, { "cell_type": "code", "execution_count": null, @@ -135,7 +391,7 @@ ], "metadata": { "kernelspec": { - "display_name": "langchain", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -149,9 +405,8 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" - }, - "orig_nbformat": 4 + "version": "3.9.1" + } }, "nbformat": 4, "nbformat_minor": 2 diff --git a/docs/modules/indexes/text_splitters/examples/html.ipynb b/docs/modules/indexes/text_splitters/examples/html.ipynb deleted file mode 100644 index 53905136..00000000 --- a/docs/modules/indexes/text_splitters/examples/html.ipynb +++ /dev/null @@ -1,172 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "80f6cd99", - "metadata": {}, - "source": [ - "# HTML\n", - "\n", - ">[HTML](https://en.wikipedia.org/wiki/HMTL) s the standard markup language for documents designed to be displayed in a web browser.\n", - "\n", - "`HtmlTextSplitter` splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with HTML-specific separators. See the source code to see the HTML syntax expected by default.\n", - "\n", - "1. How the text is split: by list of `HTML` specific separators\n", - "2. 
How the chunk size is measured: by number of characters" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "96d64839", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.text_splitter import HtmlTextSplitter" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "cfb0da17", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "html_text = \"\"\"\n", - "\n", - "\n", - " \n", - " šŸ¦œļøšŸ”— LangChain\n", - " \n", - " \n", - " \n", - "
\n", - "

šŸ¦œļøšŸ”— LangChain

\n", - "

āš” Building applications with LLMs through composability āš”

\n", - "
\n", - "
\n", - " As an open source project in a rapidly developing field, we are extremely open to contributions.\n", - "
\n", - " \n", - "\n", - "\"\"\"\n", - "\n", - "html_splitter = HtmlTextSplitter(chunk_size=175, chunk_overlap=20)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "d59a4fe8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "docs = html_splitter.create_documents([html_text])" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "cbb2e100", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[Document(page_content='\\n', metadata={}),\n", - " Document(page_content='šŸ¦œļøšŸ”— LangChain', metadata={}),\n", - " Document(page_content='body {\\n font-family: Arial, sans-serif;\\n }\\n h1 {\\n color: darkblue;\\n }\\n \\n ', metadata={}),\n", - " Document(page_content='/style>\\n ', metadata={}),\n", - " Document(page_content='
\\n

šŸ¦œļøšŸ”— LangChain

\\n

āš” Building applications with LLMs through composability āš”

\\n
', metadata={}),\n", - " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.\\n \\n \\n', metadata={})]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "91b56e7e-b285-4ca4-a786-149544e0e3c6", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['\\n',\n", - " 'šŸ¦œļøšŸ”— LangChain',\n", - " 'body {\\n font-family: Arial, sans-serif;\\n }\\n h1 {\\n color: darkblue;\\n }\\n \\n ',\n", - " '/style>\\n ',\n", - " '
\\n

šŸ¦œļøšŸ”— LangChain

\\n

āš” Building applications with LLMs through composability āš”

\\n
',\n", - " 'As an open source project in a rapidly developing field, we are extremely open to contributions.\\n \\n \\n']" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "html_splitter.split_text(html_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9bee7858-9175-4d99-bd30-68f2dece8601", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - }, - "vscode": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/modules/indexes/text_splitters/examples/latex.ipynb b/docs/modules/indexes/text_splitters/examples/latex.ipynb deleted file mode 100644 index aaf1f575..00000000 --- a/docs/modules/indexes/text_splitters/examples/latex.ipynb +++ /dev/null @@ -1,155 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "3a2f572e", - "metadata": {}, - "source": [ - "# LaTeX\n", - "\n", - ">[LaTeX](https://en.wikipedia.org/wiki/LaTeX) is widely used in academia for the communication and publication of scientific documents in many fields, including mathematics, computer science, engineering, physics, chemistry, economics, linguistics, quantitative psychology, philosophy, and political science.\n", - "\n", - "`LatexTextSplitter` splits text along `LaTeX` headings, headlines, enumerations and more. It's implemented as a subclass of `RecursiveCharacterSplitter` with LaTeX-specific separators. See the source code for more details.\n", - "\n", - "1. 
How the text is split: by list of `LaTeX` specific tags\n", - "2. How the chunk size is measured: by number of characters" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "c2503917", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.text_splitter import LatexTextSplitter" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e46b753b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "latex_text = \"\"\"\n", - "\\documentclass{article}\n", - "\n", - "\\begin{document}\n", - "\n", - "\\maketitle\n", - "\n", - "\\section{Introduction}\n", - "Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.\n", - "\n", - "\\subsection{History of LLMs}\n", - "The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.\n", - "\n", - "\\subsection{Applications of LLMs}\n", - "LLMs have many applications in industry, including chatbots, content creation, and virtual assistants. 
They can also be used in academia for research in linguistics, psychology, and computational linguistics.\n", - "\n", - "\\end{document}\n", - "\"\"\"\n", - "latex_splitter = LatexTextSplitter(chunk_size=400, chunk_overlap=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "73b5bd33", - "metadata": {}, - "outputs": [], - "source": [ - "docs = latex_splitter.create_documents([latex_text])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e1c7fbd5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Document(page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle', lookup_str='', metadata={}, lookup_index=0),\n", - " Document(page_content='Introduction}\\nLarge language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.', lookup_str='', metadata={}, lookup_index=0),\n", - " Document(page_content='History of LLMs}\\nThe earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.', lookup_str='', metadata={}, lookup_index=0),\n", - " Document(page_content='Applications of LLMs}\\nLLMs have many applications in industry, including chatbots, content creation, and virtual assistants. 
They can also be used in academia for research in linguistics, psychology, and computational linguistics.\\n\\n\\\\end{document}', lookup_str='', metadata={}, lookup_index=0)]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "40e62829-9485-414e-9ea1-e1a8fc7c88cb", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle',\n", - " 'Introduction}\\nLarge language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.',\n", - " 'History of LLMs}\\nThe earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.',\n", - " 'Applications of LLMs}\\nLLMs have many applications in industry, including chatbots, content creation, and virtual assistants. 
They can also be used in academia for research in linguistics, psychology, and computational linguistics.\\n\\n\\\\end{document}']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "latex_splitter.split_text(latex_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7deb8f25-a062-4956-9f90-513802069667", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - }, - "vscode": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/modules/indexes/text_splitters/examples/markdown.ipynb b/docs/modules/indexes/text_splitters/examples/markdown.ipynb deleted file mode 100644 index 1c784e8b..00000000 --- a/docs/modules/indexes/text_splitters/examples/markdown.ipynb +++ /dev/null @@ -1,153 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "80f6cd99", - "metadata": {}, - "source": [ - "# Markdown\n", - "\n", - ">[Markdown](https://en.wikipedia.org/wiki/Markdown) is a lightweight markup language for creating formatted text using a plain-text editor.\n", - "\n", - "`MarkdownTextSplitter` splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with Markdown-specific separators. See the source code to see the Markdown syntax expected by default.\n", - "\n", - "1. How the text is split: by list of `markdown` specific separators\n", - "2. 
How the chunk size is measured: by number of characters" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "96d64839", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.text_splitter import MarkdownTextSplitter" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "cfb0da17", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "markdown_text = \"\"\"\n", - "# šŸ¦œļøšŸ”— LangChain\n", - "\n", - "āš” Building applications with LLMs through composability āš”\n", - "\n", - "## Quick Install\n", - "\n", - "```bash\n", - "# Hopefully this code block isn't split\n", - "pip install langchain\n", - "```\n", - "\n", - "As an open source project in a rapidly developing field, we are extremely open to contributions.\n", - "\"\"\"\n", - "markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "d59a4fe8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "docs = markdown_splitter.create_documents([markdown_text])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "cbb2e100", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[Document(page_content='# šŸ¦œļøšŸ”— LangChain\\n\\nāš” Building applications with LLMs through composability āš”', metadata={}),\n", - " Document(page_content=\"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\", metadata={}),\n", - " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.', metadata={})]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "91b56e7e-b285-4ca4-a786-149544e0e3c6", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['# 
šŸ¦œļøšŸ”— LangChain\\n\\nāš” Building applications with LLMs through composability āš”',\n", - " \"Quick Install\\n\\n```bash\\n# Hopefully this code block isn't split\\npip install langchain\",\n", - " 'As an open source project in a rapidly developing field, we are extremely open to contributions.']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "markdown_splitter.split_text(markdown_text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9bee7858-9175-4d99-bd30-68f2dece8601", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - }, - "vscode": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/modules/indexes/text_splitters/examples/python.ipynb b/docs/modules/indexes/text_splitters/examples/python.ipynb deleted file mode 100644 index ae48b7f1..00000000 --- a/docs/modules/indexes/text_splitters/examples/python.ipynb +++ /dev/null @@ -1,121 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "c350765d", - "metadata": {}, - "source": [ - "# Python Code\n", - "\n", - "`PythonCodeTextSplitter` splits text along python class and method definitions. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with Python-specific separators. See the source code to see the Python syntax expected by default.\n", - "\n", - "1. How the text is split: by list of python specific separators\n", - "2. 
How the chunk size is measured: by number of characters" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1703463f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.text_splitter import PythonCodeTextSplitter" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "f17a1854", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "python_text = \"\"\"\n", - "class Foo:\n", - "\n", - " def bar():\n", - " \n", - " \n", - "def foo():\n", - "\n", - "def testing_func_with_long_name():\n", - "\n", - "def bar():\n", - "\"\"\"\n", - "python_splitter = PythonCodeTextSplitter(chunk_size=40, chunk_overlap=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "8cc33770", - "metadata": {}, - "outputs": [], - "source": [ - "docs = python_splitter.create_documents([python_text])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f5f70775", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Document(page_content='class Foo:\\n\\n def bar():', metadata={}),\n", - " Document(page_content='def foo():', metadata={}),\n", - " Document(page_content='def testing_func_with_long_name():', metadata={}),\n", - " Document(page_content='def bar():', metadata={})]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e096d42", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.1" - }, - "vscode": { - "interpreter": { - "hash": 
"aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py index 54ee22b8..a88acf33 100644 --- a/langchain/text_splitter.py +++ b/langchain/text_splitter.py @@ -293,6 +293,24 @@ class TokenTextSplitter(TextSplitter): return splits +class Language(str, Enum): + CPP = "cpp" + GO = "go" + JAVA = "java" + JS = "js" + PHP = "php" + PROTO = "proto" + PYTHON = "python" + RST = "rst" + RUBY = "ruby" + RUST = "rust" + SCALA = "scala" + SWIFT = "swift" + MARKDOWN = "markdown" + LATEX = "latex" + HTML = "html" + + class RecursiveCharacterTextSplitter(TextSplitter): """Implementation of splitting text that looks at characters. @@ -350,205 +368,15 @@ class RecursiveCharacterTextSplitter(TextSplitter): def split_text(self, text: str) -> List[str]: return self._split_text(text, self._separators) - -class NLTKTextSplitter(TextSplitter): - """Implementation of splitting text that looks at sentences using NLTK.""" - - def __init__(self, separator: str = "\n\n", **kwargs: Any): - """Initialize the NLTK splitter.""" - super().__init__(**kwargs) - try: - from nltk.tokenize import sent_tokenize - - self._tokenizer = sent_tokenize - except ImportError: - raise ImportError( - "NLTK is not installed, please install it with `pip install nltk`." - ) - self._separator = separator - - def split_text(self, text: str) -> List[str]: - """Split incoming text and return chunks.""" - # First we naively split the large input into a bunch of smaller ones. 
- splits = self._tokenizer(text) - return self._merge_splits(splits, self._separator) - - -class SpacyTextSplitter(TextSplitter): - """Implementation of splitting text that looks at sentences using Spacy.""" - - def __init__( - self, separator: str = "\n\n", pipeline: str = "en_core_web_sm", **kwargs: Any - ): - """Initialize the spacy text splitter.""" - super().__init__(**kwargs) - try: - import spacy - except ImportError: - raise ImportError( - "Spacy is not installed, please install it with `pip install spacy`." - ) - self._tokenizer = spacy.load(pipeline) - self._separator = separator - - def split_text(self, text: str) -> List[str]: - """Split incoming text and return chunks.""" - splits = (str(s) for s in self._tokenizer(text).sents) - return self._merge_splits(splits, self._separator) - - -class MarkdownTextSplitter(RecursiveCharacterTextSplitter): - """Attempts to split the text along Markdown-formatted headings.""" - - def __init__(self, **kwargs: Any): - """Initialize a MarkdownTextSplitter.""" - separators = [ - # First, try to split along Markdown headings (starting with level 2) - "\n## ", - "\n### ", - "\n#### ", - "\n##### ", - "\n###### ", - # Note the alternative syntax for headings (below) is not handled here - # Heading level 2 - # --------------- - # End of code block - "```\n\n", - # Horizontal lines - "\n\n***\n\n", - "\n\n---\n\n", - "\n\n___\n\n", - # Note that this splitter doesn't handle horizontal lines defined - # by *three or more* of ***, ---, or ___, but this is not handled - "\n\n", - "\n", - " ", - "", - ] - super().__init__(separators=separators, **kwargs) - - -class LatexTextSplitter(RecursiveCharacterTextSplitter): - """Attempts to split the text along Latex-formatted layout elements.""" - - def __init__(self, **kwargs: Any): - """Initialize a LatexTextSplitter.""" - separators = [ - # First, try to split along Latex sections - "\n\\chapter{", - "\n\\section{", - "\n\\subsection{", - "\n\\subsubsection{", - # Now split by 
environments - "\n\\begin{enumerate}", - "\n\\begin{itemize}", - "\n\\begin{description}", - "\n\\begin{list}", - "\n\\begin{quote}", - "\n\\begin{quotation}", - "\n\\begin{verse}", - "\n\\begin{verbatim}", - ## Now split by math environments - "\n\\begin{align}", - "$$", - "$", - # Now split by the normal type of lines - " ", - "", - ] - super().__init__(separators=separators, **kwargs) - - -class PythonCodeTextSplitter(RecursiveCharacterTextSplitter): - """Attempts to split the text along Python syntax.""" - - def __init__(self, **kwargs: Any): - """Initialize a PythonCodeTextSplitter.""" - separators = [ - # First, try to split along class definitions - "\nclass ", - "\ndef ", - "\n\tdef ", - # Now split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - super().__init__(separators=separators, **kwargs) - - -class HtmlTextSplitter(RecursiveCharacterTextSplitter): - """Attempts to split the text along HTML layout elements.""" - - def __init__(self, **kwargs: Any): - """Initialize a HtmlTextSplitter.""" - separators = [ - # First, try to split along HTML tags - "", - "
", - "

", - "
", - "

  • ", - "

    ", - "

    ", - "

    ", - "

    ", - "

    ", - "
    ", - "", - "", - "", - "
    ", - "", - "
      ", - "
        ", - "
        ", - "