mirror of
https://github.com/openai/openai-cookbook
synced 2024-11-17 15:29:46 +00:00
295 lines
9.2 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "4e63973b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import json\n",
|
|
"import tiktoken\n",
|
|
"import numpy as np\n",
|
|
"from collections import defaultdict"
|
|
]
|
|
},
|
|
{
"cell_type": "code",
"execution_count": 2,
"id": "748510f2",
"metadata": {},
"outputs": [],
"source": [
"# Path to the training data: a JSONL file, one JSON object per line,\n",
"# each expected to contain a \"messages\" list (validated below).\n",
"data_path = \"data/toy_chat_fine_tuning.jsonl\""
]
},
|
|
{
"cell_type": "code",
"execution_count": 3,
"id": "c248ccd1",
"metadata": {},
"outputs": [],
"source": [
"# Load dataset\n",
"# Parse each JSONL line into a dict; the whole dataset is held in memory.\n",
"with open(data_path, 'r', encoding='utf-8') as f:\n",
"    dataset = [json.loads(line) for line in f]"
]
},
|
|
{
"cell_type": "code",
"execution_count": 4,
"id": "189ed16c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num examples: 5\n",
"First example:\n",
"{'role': 'system', 'content': 'You are a happy assistant that puts a positive spin on everything.'}\n",
"{'role': 'user', 'content': 'I fell off my bike today.'}\n",
"{'role': 'assistant', 'content': \"It's great that you're getting exercise outdoors!\"}\n"
]
}
],
"source": [
"# Initial dataset stats\n",
"# Sanity check: example count plus the messages of the first conversation.\n",
"print(\"Num examples:\", len(dataset))\n",
"print(\"First example:\")\n",
"for message in dataset[0][\"messages\"]:\n",
"    print(message)"
]
},
|
|
{
"cell_type": "code",
"execution_count": 5,
"id": "d9f3ccbf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"No errors found\n"
]
}
],
"source": [
"# Format error checks\n",
"# Validate every example against the chat fine-tuning schema, tallying one\n",
"# counter per error category; a clean dataset prints \"No errors found\".\n",
"format_errors = defaultdict(int)\n",
"\n",
"for ex in dataset:\n",
"    # Each JSONL line must parse to a dict with a non-empty \"messages\" list.\n",
"    if not isinstance(ex, dict):\n",
"        format_errors[\"data_type\"] += 1\n",
"        continue\n",
"    \n",
"    messages = ex.get(\"messages\", None)\n",
"    if not messages:\n",
"        format_errors[\"missing_messages_list\"] += 1\n",
"        continue\n",
"    \n",
"    for message in messages:\n",
"        # Every message needs both a \"role\" and a \"content\" key,\n",
"        # and no keys beyond role/content/name.\n",
"        if \"role\" not in message or \"content\" not in message:\n",
"            format_errors[\"message_missing_key\"] += 1\n",
"    \n",
"        if any(k not in (\"role\", \"content\", \"name\") for k in message):\n",
"            format_errors[\"message_unrecognized_key\"] += 1\n",
"    \n",
"        if message.get(\"role\", None) not in (\"system\", \"user\", \"assistant\"):\n",
"            format_errors[\"unrecognized_role\"] += 1\n",
"    \n",
"        # Content must be a non-empty string.\n",
"        content = message.get(\"content\", None)\n",
"        if not content or not isinstance(content, str):\n",
"            format_errors[\"missing_content\"] += 1\n",
"    \n",
"    # Training requires at least one assistant turn per example.\n",
"    if not any(message.get(\"role\", None) == \"assistant\" for message in messages):\n",
"        format_errors[\"example_missing_assistant_message\"] += 1\n",
"\n",
"if format_errors:\n",
"    print(\"Found errors:\")\n",
"    for k, v in format_errors.items():\n",
"        print(f\"{k}: {v}\")\n",
"else:\n",
"    print(\"No errors found\")"
]
},
|
|
{
"cell_type": "code",
"execution_count": 6,
"id": "8f4b47b5",
"metadata": {},
"outputs": [],
"source": [
"# Token counting functions\n",
"encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
"\n",
"# not exact!\n",
"# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb\n",
"def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):\n",
"    \"\"\"Approximate the total token count of one chat example.\n",
"\n",
"    Adds a fixed per-message overhead, the encoded length of every value,\n",
"    one extra token per 'name' key, and a final +3 for reply priming.\n",
"    \"\"\"\n",
"    num_tokens = 0\n",
"    for message in messages:\n",
"        num_tokens += tokens_per_message\n",
"        for key, value in message.items():\n",
"            num_tokens += len(encoding.encode(value))\n",
"            if key == \"name\":\n",
"                num_tokens += tokens_per_name\n",
"    num_tokens += 3\n",
"    return num_tokens\n",
"\n",
"def num_assistant_tokens_from_messages(messages):\n",
"    \"\"\"Count tokens in assistant messages only (the tokens trained on).\"\"\"\n",
"    num_tokens = 0\n",
"    for message in messages:\n",
"        if message[\"role\"] == \"assistant\":\n",
"            num_tokens += len(encoding.encode(message[\"content\"]))\n",
"    return num_tokens\n",
"\n",
"def print_distribution(values, name):\n",
"    \"\"\"Print min/max, mean/median and p5/p95 of a list of numbers.\"\"\"\n",
"    print(f\"\\n#### Distribution of {name}:\")\n",
"    print(f\"min / max: {min(values)}, {max(values)}\")\n",
"    print(f\"mean / median: {np.mean(values)}, {np.median(values)}\")\n",
"    # Bug fix: the label says p5 / p95 but the quantiles used to be\n",
"    # 0.1 / 0.9 (i.e. p10 / p90); use 0.05 / 0.95 so the label is accurate.\n",
"    # Note: recorded outputs downstream were produced with the old quantiles.\n",
"    print(f\"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}\")"
]
},
|
|
{
"cell_type": "code",
"execution_count": 7,
"id": "52e58ee4",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num examples missing system message: 1\n",
"Num examples missing user message: 1\n",
"\n",
"#### Distribution of num_messages_per_example:\n",
"min / max: 2, 9\n",
"mean / median: 3.8, 3.0\n",
"p5 / p95: 2.0, 6.6000000000000005\n",
"\n",
"#### Distribution of num_total_tokens_per_example:\n",
"min / max: 26, 8032\n",
"mean / median: 1648.4, 45.0\n",
"p5 / p95: 26.8, 4863.6\n",
"\n",
"#### Distribution of num_assistant_tokens_per_example:\n",
"min / max: 4, 8000\n",
"mean / median: 1610.2, 10.0\n",
"p5 / p95: 6.0, 4811.200000000001\n",
"\n",
"1 examples may be over the 4096 token limit, they will be truncated during fine-tuning\n"
]
}
],
"source": [
"# Warnings and tokens counts\n",
"# Scan every example for missing system/user turns and collect per-example\n",
"# message counts and token counts for the distribution printouts below.\n",
"n_missing_system = 0\n",
"n_missing_user = 0\n",
"n_messages = []\n",
"convo_lens = []\n",
"assistant_message_lens = []\n",
"\n",
"for ex in dataset:\n",
"    messages = ex[\"messages\"]\n",
"    if not any(message[\"role\"] == \"system\" for message in messages):\n",
"        n_missing_system += 1\n",
"    if not any(message[\"role\"] == \"user\" for message in messages):\n",
"        n_missing_user += 1\n",
"    n_messages.append(len(messages))\n",
"    convo_lens.append(num_tokens_from_messages(messages))\n",
"    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))\n",
"    \n",
"print(\"Num examples missing system message:\", n_missing_system)\n",
"print(\"Num examples missing user message:\", n_missing_user)\n",
"print_distribution(n_messages, \"num_messages_per_example\")\n",
"print_distribution(convo_lens, \"num_total_tokens_per_example\")\n",
"print_distribution(assistant_message_lens, \"num_assistant_tokens_per_example\")\n",
"# NOTE(review): 4096 is the context limit assumed here — confirm it matches\n",
"# the current limit of the model being fine-tuned.\n",
"n_too_long = sum(l > 4096 for l in convo_lens)\n",
"print(f\"\\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning\")"
]
},
|
|
{
"cell_type": "code",
"execution_count": 8,
"id": "fb95a7ce",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset has ~4306 tokens that will be charged for during training\n",
"By default, you'll train for 20 epochs on this dataset\n",
"By default, you'll be charged for ~86120 tokens\n",
"See pricing page to estimate total costs\n"
]
}
],
"source": [
"# Pricing and default n_epochs estimate\n",
"# Tokens beyond this cap per example are not billed (they get truncated).\n",
"MAX_TOKENS_PER_EXAMPLE = 4096\n",
"\n",
"# Aim for TARGET_EPOCHS, but adjust the epoch count so the total number of\n",
"# trained examples lands in [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES],\n",
"# clamped to [MIN_DEFAULT_EPOCHS, MAX_DEFAULT_EPOCHS].\n",
"TARGET_EPOCHS = 3\n",
"MIN_TARGET_EXAMPLES = 100\n",
"MAX_TARGET_EXAMPLES = 25000\n",
"MIN_DEFAULT_EPOCHS = 1\n",
"MAX_DEFAULT_EPOCHS = 25\n",
"\n",
"n_epochs = TARGET_EPOCHS\n",
"n_train_examples = len(dataset)\n",
"if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:\n",
"    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)\n",
"elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:\n",
"    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)\n",
"\n",
"# Billed tokens = sum of per-example token counts, each capped at the max.\n",
"n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)\n",
"print(f\"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n",
"print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n",
"print(f\"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens\")\n",
"print(\"See pricing page to estimate total costs\")"
]
},
|
|
{
"cell_type": "code",
"execution_count": null,
"id": "5736cd86",
"metadata": {},
"outputs": [],
"source": []
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|