mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
c998569c8f
#docs: text splitters improvements Changes are only in the Jupyter notebooks. - added links to the source packages and a short description of these packages - removed " Text Splitters" suffixes from the TOC elements (they made the list of the text splitters messy) - moved text splitters, based on the length function into a separate list. They can be mixed with any classes from the "Text Splitters", so it is a different classification. ## Who can review? @hwchase17 - project lead @eyurtsev @vowelparrot NOTE: please, check out the results of the `Python code` text splitter example (text_splitters/examples/python.ipynb). It looks suboptimal.
141 lines
3.6 KiB
Plaintext
141 lines
3.6 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "072eee66",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Recursive Character\n",
|
|
"\n",
|
|
"This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is `[\"\\n\\n\", \"\\n\", \" \", \"\"]`. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.\n",
|
|
"\n",
|
|
"\n",
|
|
"1. How the text is split: by list of characters\n",
|
|
"2. How the chunk size is measured: by number of characters"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "d887c134",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# This is a long document we can split up.\n",
|
|
"with open('../../../state_of_the_union.txt') as f:\n",
|
|
" state_of_the_union = f.read()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "14662639",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain.text_splitter import RecursiveCharacterTextSplitter"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "fc6e42c8",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"text_splitter = RecursiveCharacterTextSplitter(\n",
|
|
" # Set a really small chunk size, just to show.\n",
|
|
" chunk_size = 100,\n",
|
|
" chunk_overlap = 20,\n",
|
|
" length_function = len,\n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "bd1a0a15",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and' lookup_str='' metadata={} lookup_index=0\n",
|
|
"page_content='of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.' lookup_str='' metadata={} lookup_index=0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"texts = text_splitter.create_documents([state_of_the_union])\n",
|
|
"print(texts[0])\n",
|
|
"print(texts[1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "ca35212d-634c-4679-9042-19c294a3c815",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"['Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and',\n",
|
|
" 'of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.']"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"text_splitter.split_text(state_of_the_union)[:2]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b019a56a-7ba5-479d-b696-32188e4bc433",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.6"
|
|
},
|
|
"vscode": {
|
|
"interpreter": {
|
|
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
|
|
}
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|