Harrison/docs splitter (#879)

This commit is contained in:
Harrison Chase 2023-02-03 15:09:13 -08:00 committed by GitHub
parent bcfbc7a818
commit 0b9f086d36
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,7 +1,6 @@
{ {
"cells": [ "cells": [
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "b118c9dc", "id": "b118c9dc",
"metadata": {}, "metadata": {},
@ -476,10 +475,59 @@
"print(texts[0])" "print(texts[0])"
] ]
}, },
{
"cell_type": "markdown",
"id": "53049ff5",
"metadata": {},
"source": [
"## Token Text Splitter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a1a118b1",
"metadata": {},
"outputs": [],
"source": [
"from langchain.text_splitter import TokenTextSplitter"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ef37c5d3",
"metadata": {},
"outputs": [],
"source": [
"text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "5750228a",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Madam Speaker, Madam Vice President, our\n"
]
}
],
"source": [
"texts = text_splitter.split_text(state_of_the_union)\n",
"print(texts[0])"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "a1a118b1", "id": "0905c1de",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []
@ -487,7 +535,7 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
@ -501,7 +549,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.12 (main, Mar 26 2022, 15:51:15) \n[Clang 13.1.6 (clang-1316.0.21.2)]" "version": "3.10.9"
}, },
"vscode": { "vscode": {
"interpreter": { "interpreter": {