From 4220188df44d1e115e4c00dda6ca988cedbe4f80 Mon Sep 17 00:00:00 2001
From: Sean Diacono
Date: Wed, 8 May 2024 13:16:11 +0200
Subject: [PATCH 1/2] Update gpt-3.5 token limit in Chat_finetuning_data_prep.ipynb

The token limit in Chat_finetuning_data_prep.ipynb is not up to date with the current context window limit. It should be 16,385 tokens, as stated in https://platform.openai.com/docs/models/gpt-3-5-turbo
---
 examples/Chat_finetuning_data_prep.ipynb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/Chat_finetuning_data_prep.ipynb b/examples/Chat_finetuning_data_prep.ipynb
index c5a3cb7b..d97df75e 100644
--- a/examples/Chat_finetuning_data_prep.ipynb
+++ b/examples/Chat_finetuning_data_prep.ipynb
@@ -207,7 +207,7 @@
 "2. **Number of Messages Per Example**: Summarizes the distribution of the number of messages in each conversation, providing insight into dialogue complexity.\n",
 "3. **Total Tokens Per Example**: Calculates and summarizes the distribution of the total number of tokens in each conversation. Important for understanding fine-tuning costs.\n",
 "4. **Tokens in Assistant's Messages**: Calculates the number of tokens in the assistant's messages per conversation and summarizes this distribution. Useful for understanding the assistant's verbosity.\n",
- "5. **Token Limit Warnings**: Checks if any examples exceed the maximum token limit (4096 tokens), as such examples will be truncated during fine-tuning, potentially resulting in data loss.\n"
+ "5. **Token Limit Warnings**: Checks if any examples exceed the maximum token limit (16,385 tokens), as such examples will be truncated during fine-tuning, potentially resulting in data loss.\n"
 ]
 },
 {
@@ -240,7 +240,7 @@
 "mean / median: 1610.2, 10.0\n",
 "p5 / p95: 6.0, 4811.200000000001\n",
 "\n",
- "1 examples may be over the 4096 token limit, they will be truncated during fine-tuning\n"
+ "0 examples may be over the 16,385 token limit, they will be truncated during fine-tuning\n"
 ]
 }
 ],
@@ -267,8 +267,8 @@
 "print_distribution(n_messages, \"num_messages_per_example\")\n",
 "print_distribution(convo_lens, \"num_total_tokens_per_example\")\n",
 "print_distribution(assistant_message_lens, \"num_assistant_tokens_per_example\")\n",
- "n_too_long = sum(l > 4096 for l in convo_lens)\n",
- "print(f\"\\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning\")"
+ "n_too_long = sum(l > 16385 for l in convo_lens)\n",
+ "print(f\"\\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning\")"
 ]
 },
 {

From 797e19610fe1a7dfb3fdc817f4969eb6affe5550 Mon Sep 17 00:00:00 2001
From: Sean Diacono
Date: Thu, 9 May 2024 13:39:31 +0200
Subject: [PATCH 2/2] Update the MAX_TOKENS_PER_EXAMPLE to correct limit

---
 examples/Chat_finetuning_data_prep.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/Chat_finetuning_data_prep.ipynb b/examples/Chat_finetuning_data_prep.ipynb
index d97df75e..20779547 100644
--- a/examples/Chat_finetuning_data_prep.ipynb
+++ b/examples/Chat_finetuning_data_prep.ipynb
@@ -300,7 +300,7 @@
 ],
 "source": [
 "# Pricing and default n_epochs estimate\n",
- "MAX_TOKENS_PER_EXAMPLE = 4096\n",
+ "MAX_TOKENS_PER_EXAMPLE = 16385\n",
 "\n",
 "TARGET_EPOCHS = 3\n",
 "MIN_TARGET_EXAMPLES = 100\n",
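
Review note (not part of the patches above): the corrected limit is easy to sanity-check outside the notebook. The sketch below mirrors the over-limit check touched by the first patch; `convo_lens` here is a hypothetical list of per-example token lengths standing in for the notebook's variable of the same name, and the truncation-aware token estimate at the end is an illustrative assumption rather than code from this diff.

MAX_TOKENS_PER_EXAMPLE = 16385  # gpt-3.5-turbo fine-tuning context window (16,385 tokens)

# Hypothetical per-example token counts; in the notebook these are computed
# by counting tokens in each training conversation.
convo_lens = [1200, 4800, 17000]

# Flag examples over the limit. The comparison must use the constant or the
# literal 16385 -- writing "16,385" inside the expression is a Python syntax error.
n_too_long = sum(length > MAX_TOKENS_PER_EXAMPLE for length in convo_lens)
print(f"{n_too_long} examples may be over the {MAX_TOKENS_PER_EXAMPLE} token limit")

# Anything beyond the limit is truncated during fine-tuning, so each example
# contributes at most MAX_TOKENS_PER_EXAMPLE tokens to training.
n_trained_tokens = sum(min(length, MAX_TOKENS_PER_EXAMPLE) for length in convo_lens)
print(f"~{n_trained_tokens} tokens from this (hypothetical) dataset would be trained on")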