From f72bb966f894f99c9ffc2c730be392c71d020ac8 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Tue, 30 May 2023 21:06:07 -0700 Subject: [PATCH] Harrison/html splitter (#5468) Co-authored-by: David Revillas <26328973+r3v1@users.noreply.github.com> --- .../text_splitters/examples/html.ipynb | 172 ++++++++++++++++++ langchain/text_splitter.py | 39 ++++ 2 files changed, 211 insertions(+) create mode 100644 docs/modules/indexes/text_splitters/examples/html.ipynb diff --git a/docs/modules/indexes/text_splitters/examples/html.ipynb b/docs/modules/indexes/text_splitters/examples/html.ipynb new file mode 100644 index 00000000..53905136 --- /dev/null +++ b/docs/modules/indexes/text_splitters/examples/html.ipynb @@ -0,0 +1,172 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "80f6cd99", + "metadata": {}, + "source": [ + "# HTML\n", + "\n", + ">[HTML](https://en.wikipedia.org/wiki/HTML) is the standard markup language for documents designed to be displayed in a web browser.\n", + "\n", + "`HtmlTextSplitter` splits text along HTML layout elements. It's implemented as a simple subclass of `RecursiveCharacterTextSplitter` with HTML-specific separators. See the source code to see the HTML syntax expected by default.\n", + "\n", + "1. How the text is split: by a list of `HTML`-specific separators\n", + "2. How the chunk size is measured: by number of characters" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "96d64839", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.text_splitter import HtmlTextSplitter" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "cfb0da17", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "html_text = \"\"\"\n", + "\n", + "\n", + " \n", + " 🦜️🔗 LangChain\n", + " \n", + " \n", + " \n", + "
\n", + "

🦜️🔗 LangChain

\n", + "

⚡ Building applications with LLMs through composability ⚡

\n", + "
\n", + "
\n", + " As an open source project in a rapidly developing field, we are extremely open to contributions.\n", + "
\n", + " \n", + "\n", + "\"\"\"\n", + "\n", + "html_splitter = HtmlTextSplitter(chunk_size=175, chunk_overlap=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d59a4fe8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = html_splitter.create_documents([html_text])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "cbb2e100", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='\\n', metadata={}),\n", + " Document(page_content='🦜️🔗 LangChain', metadata={}),\n", + " Document(page_content='body {\\n font-family: Arial, sans-serif;\\n }\\n h1 {\\n color: darkblue;\\n }\\n \\n ', metadata={}),\n", + " Document(page_content='/style>\\n ', metadata={}),\n", + " Document(page_content='
\\n

🦜️🔗 LangChain

\\n

⚡ Building applications with LLMs through composability ⚡

\\n
', metadata={}),\n", + " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.\\n \\n \\n', metadata={})]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "91b56e7e-b285-4ca4-a786-149544e0e3c6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['\\n',\n", + " '🦜️🔗 LangChain',\n", + " 'body {\\n font-family: Arial, sans-serif;\\n }\\n h1 {\\n color: darkblue;\\n }\\n \\n ',\n", + " '/style>\\n ',\n", + " '
\\n

🦜️🔗 LangChain

\\n

⚡ Building applications with LLMs through composability ⚡

\\n
',\n", + " 'As an open source project in a rapidly developing field, we are extremely open to contributions.\\n \\n \\n']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "html_splitter.split_text(html_text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9bee7858-9175-4d99-bd30-68f2dece8601", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + }, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py index c5e1a843..54ee22b8 100644 --- a/langchain/text_splitter.py +++ b/langchain/text_splitter.py @@ -478,6 +478,45 @@ class PythonCodeTextSplitter(RecursiveCharacterTextSplitter): super().__init__(separators=separators, **kwargs) +class HtmlTextSplitter(RecursiveCharacterTextSplitter): + """Attempts to split the text along HTML layout elements.""" + + def __init__(self, **kwargs: Any): + """Initialize a HtmlTextSplitter.""" + separators = [ + # First, try to split along HTML tags + "", + "
", + "

", + "
", + "

  • ", + "

    ", + "

    ", + "

    ", + "

    ", + "

    ", + "
    ", + "", + "", + "", + "
    ", + "", + "
      ", + "
        ", + "
        ", + "