From f72bb966f894f99c9ffc2c730be392c71d020ac8 Mon Sep 17 00:00:00 2001
From: Harrison Chase <hw.chase.17@gmail.com>
Date: Tue, 30 May 2023 21:06:07 -0700
Subject: [PATCH] Harrison/html splitter (#5468)

Co-authored-by: David Revillas <26328973+r3v1@users.noreply.github.com>
---
 .../text_splitters/examples/html.ipynb        | 172 ++++++++++++++++++
 langchain/text_splitter.py                    |  39 ++++
 2 files changed, 211 insertions(+)
 create mode 100644 docs/modules/indexes/text_splitters/examples/html.ipynb
diff --git a/docs/modules/indexes/text_splitters/examples/html.ipynb b/docs/modules/indexes/text_splitters/examples/html.ipynb
new file mode 100644
index 00000000..53905136
--- /dev/null
+++ b/docs/modules/indexes/text_splitters/examples/html.ipynb
@@ -0,0 +1,172 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "80f6cd99",
+   "metadata": {},
+   "source": [
+    "# HTML\n",
+    "\n",
+    ">[HTML](https://en.wikipedia.org/wiki/HMTL) s the standard markup language for documents designed to be displayed in a web browser.\n",
+    "\n",
+    "`HtmlTextSplitter` splits text along Markdown headings, code blocks, or horizontal rules. It's implemented as a simple subclass of `RecursiveCharacterSplitter` with HTML-specific separators. See the source code to see the HTML syntax expected by default.\n",
+    "\n",
+    "1. How the text is split: by list of `HTML` specific separators\n",
+    "2. How the chunk size is measured: by number of characters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "96d64839",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.text_splitter import HtmlTextSplitter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "cfb0da17",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "html_text = \"\"\"\n",
+    "<!DOCTYPE html>\n",
+    "<html>\n",
+    "    <head>\n",
+    "        <title>🦜️🔗 LangChain</title>\n",
+    "        <style>\n",
+    "            body {\n",
+    "                font-family: Arial, sans-serif;\n",
+    "            }\n",
+    "            h1 {\n",
+    "                color: darkblue;\n",
+    "            }\n",
+    "        </style>\n",
+    "    </head>\n",
+    "    <body>\n",
+    "        <div>\n",
+    "            <h1>🦜️🔗 LangChain</h1>\n",
+    "            <p>⚡ Building applications with LLMs through composability ⚡</p>\n",
+    "        </div>\n",
+    "        <div>\n",
+    "            As an open source project in a rapidly developing field, we are extremely open to contributions.\n",
+    "        </div>\n",
+    "    </body>\n",
+    "</html>\n",
+    "\"\"\"\n",
+    "\n",
+    "html_splitter = HtmlTextSplitter(chunk_size=175, chunk_overlap=20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "d59a4fe8",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "docs = html_splitter.create_documents([html_text])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "cbb2e100",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='<!DOCTYPE html>\\n<html>', metadata={}),\n",
+       " Document(page_content='<title>🦜️🔗 LangChain</title>', metadata={}),\n",
+       " Document(page_content='body {\\n                font-family: Arial, sans-serif;\\n            }\\n            h1 {\\n                color: darkblue;\\n            }\\n        </style>\\n    </head>', metadata={}),\n",
+       " Document(page_content='/style>\\n    </head>', metadata={}),\n",
+       " Document(page_content='<div>\\n            <h1>🦜️🔗 LangChain</h1>\\n            <p>⚡ Building applications with LLMs through composability ⚡</p>\\n        </div>', metadata={}),\n",
+       " Document(page_content='As an open source project in a rapidly developing field, we are extremely open to contributions.\\n        </div>\\n    </body>\\n</html>', metadata={})]"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "91b56e7e-b285-4ca4-a786-149544e0e3c6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['<!DOCTYPE html>\\n<html>',\n",
+       " '<title>🦜️🔗 LangChain</title>',\n",
+       " 'body {\\n                font-family: Arial, sans-serif;\\n            }\\n            h1 {\\n                color: darkblue;\\n            }\\n        </style>\\n    </head>',\n",
+       " '/style>\\n    </head>',\n",
+       " '<div>\\n            <h1>🦜️🔗 LangChain</h1>\\n            <p>⚡ Building applications with LLMs through composability ⚡</p>\\n        </div>',\n",
+       " 'As an open source project in a rapidly developing field, we are extremely open to contributions.\\n        </div>\\n    </body>\\n</html>']"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "html_splitter.split_text(html_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9bee7858-9175-4d99-bd30-68f2dece8601",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py
index c5e1a843..54ee22b8 100644
--- a/langchain/text_splitter.py
+++ b/langchain/text_splitter.py
@@ -478,6 +478,45 @@ class PythonCodeTextSplitter(RecursiveCharacterTextSplitter):
         super().__init__(separators=separators, **kwargs)
 
 
+class HtmlTextSplitter(RecursiveCharacterTextSplitter):
+    """Attempts to split the text along HTML layout elements."""
+
+    def __init__(self, **kwargs: Any):
+        """Initialize a HtmlTextSplitter."""
+        separators = [
+            # First, try to split along HTML tags
+            "<body>",
+            "<div>",
+            "<p>",
+            "<br>",
+            "<li>",
+            "<h1>",
+            "<h2>",
+            "<h3>",
+            "<h4>",
+            "<h5>",
+            "<h6>",
+            "<span>",
+            "<table>",
+            "<tr>",
+            "<td>",
+            "<th>",
+            "<ul>",
+            "<ol>",
+            "<header>",
+            "<footer>",
+            "<nav>",
+            # Head
+            "<head>",
+            "<style>",
+            "<script>",
+            "<meta>",
+            "<title>",
+            "",
+        ]
+        super().__init__(separators=separators, **kwargs)
+
+
 class Language(str, Enum):
     CPP = "cpp"
     GO = "go"