diff --git a/docs/extras/modules/data_connection/document_transformers/text_splitters/split_by_token.ipynb b/docs/extras/modules/data_connection/document_transformers/text_splitters/split_by_token.ipynb index 1a99e3c417..dfa81c3e28 100644 --- a/docs/extras/modules/data_connection/document_transformers/text_splitters/split_by_token.ipynb +++ b/docs/extras/modules/data_connection/document_transformers/text_splitters/split_by_token.ipynb @@ -91,7 +91,9 @@ "id": "de5b6a6e", "metadata": {}, "source": [ - "We can also load a tiktoken splitter directly" + "Note that if we use `CharacterTextSplitter.from_tiktoken_encoder`, the text is only split by `CharacterTextSplitter` and the `tiktoken` tokenizer is used to merge splits. This means that a split can be larger than the chunk size as measured by the `tiktoken` tokenizer. We can use `RecursiveCharacterTextSplitter.from_tiktoken_encoder` to make sure splits are not larger than the chunk size of tokens allowed by the language model, where each split will be recursively split if it has a larger size.\n", + "\n", + "We can also load a tiktoken splitter directly, which ensures each split is smaller than the chunk size." ] },