diff --git a/Fine_tune_Llama_2_in_Google_Colab.ipynb b/Fine_tune_Llama_2_in_Google_Colab.ipynb
index e65d5ec..2fc8bc7 100644
--- a/Fine_tune_Llama_2_in_Google_Colab.ipynb
+++ b/Fine_tune_Llama_2_in_Google_Colab.ipynb
@@ -6,7 +6,7 @@
       "provenance": [],
       "machine_shape": "hm",
       "gpuType": "V100",
-      "authorship_tag": "ABX9TyPHtqq96zm8/DDNC9+543fd",
+      "authorship_tag": "ABX9TyPNl/WKBYXOzuJCP/puYm6d",
       "include_colab_link": true
     },
     "kernelspec": {
@@ -35,7 +35,9 @@
         "# Fine-tune Llama 2 in Google Colab\n",
         "> 🗣️ Large Language Model Course\n",
         "\n",
-        "❤️ Created by [@maximelabonne](), based on Younes Belkada's [GitHub Gist](https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da).\n"
+        "❤️ Created by [@maximelabonne](), based on Younes Belkada's [GitHub Gist](https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da).\n",
+        "\n",
+        "This notebook runs on a T4 GPU with high RAM. (Last update: 23 Jul 2023)\n"
       ],
       "metadata": {
         "id": "OSHlAbqzDFDq"
@@ -79,78 +81,110 @@
     {
       "cell_type": "code",
       "source": [
-        "# Used for multi-gpu\n",
-        "local_rank = -1\n",
-        "per_device_train_batch_size = 4\n",
-        "per_device_eval_batch_size = 1\n",
-        "gradient_accumulation_steps = 4\n",
-        "learning_rate = 2e-4\n",
-        "max_grad_norm = 0.3\n",
-        "weight_decay = 0.001\n",
-        "lora_alpha = 16\n",
-        "lora_dropout = 0.1\n",
-        "lora_r = 64\n",
-        "max_seq_length = 512\n",
-        "\n",
         "# The model that you want to train from the Hugging Face hub\n",
         "model_name = \"daryl149/llama-2-7b-chat-hf\"\n",
         "\n",
+        "# The instruction dataset to use\n",
+        "dataset_name = \"mlabonne/guanaco-llama2-1k\"\n",
+        "\n",
         "# Fine-tuned model name\n",
         "new_model = \"llama-2-7b-guanaco\"\n",
         "\n",
-        "# The instruction dataset to use\n",
-        "dataset_name = \"timdettmers/openassistant-guanaco\"\n",
+        "################################################################################\n",
+        "# QLoRA parameters\n",
+        "################################################################################\n",
+        "\n",
+        "# Lora attention dimension\n",
+        "lora_r = 64\n",
+        "\n",
+        "# Alpha parameter for Lora scaling\n",
+        "lora_alpha = 16\n",
+        "\n",
+        "# Dropout probability for Lora layers\n",
+        "lora_dropout = 0.1\n",
+        "\n",
+        "################################################################################\n",
+        "# bitsandbytes parameters\n",
+        "################################################################################\n",
         "\n",
         "# Activate 4-bit precision base model loading\n",
         "use_4bit = True\n",
         "\n",
-        "# Activate nested quantization for 4-bit base models\n",
-        "use_nested_quant = False\n",
-        "\n",
         "# Compute dtype for 4-bit base models\n",
         "bnb_4bit_compute_dtype = \"float16\"\n",
         "\n",
-        "# Quantization type (fp4 or nf4=\n",
+        "# Quantization type (fp4 or nf4)\n",
         "bnb_4bit_quant_type = \"nf4\"\n",
         "\n",
+        "# Activate nested quantization for 4-bit base models (double quantization)\n",
+        "use_nested_quant = False\n",
+        "\n",
+        "################################################################################\n",
+        "# TrainingArguments parameters\n",
+        "################################################################################\n",
+        "\n",
+        "# Output directory where the model predictions and checkpoints will be stored\n",
+        "output_dir = \"./results\"\n",
+        "\n",
         "# Number of training epochs\n",
         "num_train_epochs = 1\n",
         "\n",
-        "# Enable fp16 training\n",
+        "# Enable fp16/bf16 training (set bf16 to True with an A100)\n",
         "fp16 = False\n",
-        "\n",
-        "# Enable bf16 training\n",
         "bf16 = False\n",
         "\n",
-        "# Use packing dataset creating\n",
-        "packing = False\n",
+        "# Batch size per GPU for training\n",
+        "per_device_train_batch_size = 4\n",
+        "\n",
+        "# Batch size per GPU for evaluation\n",
+        "per_device_eval_batch_size = 4\n",
+        "\n",
+        "# Number of update steps to accumulate the gradients for\n",
+        "gradient_accumulation_steps = 1\n",
        "\n",
         "# Enable gradient checkpointing\n",
         "gradient_checkpointing = True\n",
         "\n",
+        "# Maximum gradient normal (gradient clipping)\n",
+        "max_grad_norm = 0.3\n",
+        "\n",
+        "# Initial learning rate (AdamW optimizer)\n",
+        "learning_rate = 2e-4\n",
+        "\n",
+        "# Weight decay to apply to all layers except bias/LayerNorm weights\n",
+        "weight_decay = 0.001\n",
+        "\n",
         "# Optimizer to use\n",
         "optim = \"paged_adamw_32bit\"\n",
         "\n",
-        "# Learning rate schedule (constant a bit better than cosine, and has advantage for analysis)\n",
+        "# Learning rate schedule (constant a bit better than cosine)\n",
         "lr_scheduler_type = \"constant\"\n",
         "\n",
-        "# Number of optimizer update steps\n",
-        "max_steps = 10000\n",
+        "# Number of training steps (overrides num_train_epochs)\n",
+        "max_steps = -1\n",
         "\n",
-        "# Fraction of steps to do a warmup for\n",
+        "# Ratio of steps for a linear warmup (from 0 to learning rate)\n",
         "warmup_ratio = 0.03\n",
         "\n",
-        "# Group sequences into batches with same length (saves memory and speeds up training considerably)\n",
+        "# Group sequences into batches with same length\n",
+        "# Saves memory and speeds up training considerably\n",
         "group_by_length = True\n",
         "\n",
         "# Save checkpoint every X updates steps\n",
         "save_steps = 10\n",
         "\n",
         "# Log every X updates steps\n",
-        "logging_steps = 10\n",
+        "logging_steps = 1\n",
         "\n",
-        "# The output directory where the model predictions and checkpoints will be written\n",
-        "output_dir = \"./results\"\n",
+        "################################################################################\n",
+        "# SFT parameters\n",
+        "################################################################################\n",
+        "\n",
+        "# Maximum sequence length to use\n",
+        "max_seq_length = None\n",
+        "\n",
+        "# Pack multiple short examples in the same input sequence to increase efficiency\n",
+        "packing = False\n",
         "\n",
         "# Load the entire model on the GPU 0\n",
         "device_map = {\"\": 0}"
       ],
       "metadata": {
@@ -164,6 +198,7 @@
     {
       "cell_type": "code",
       "source": [
+        "# Load dataset (you can process it here)\n",
         "dataset = load_dataset(dataset_name, split=\"train\")\n",
         "\n",
         "# Load tokenizer and model with QLoRA configuration\n",
@@ -176,13 +211,15 @@
         "    bnb_4bit_use_double_quant=use_nested_quant,\n",
         ")\n",
         "\n",
+        "# Check GPU compatibility with bfloat16\n",
         "if compute_dtype == torch.float16 and use_4bit:\n",
         "    major, _ = torch.cuda.get_device_capability()\n",
         "    if major >= 8:\n",
         "        print(\"=\" * 80)\n",
-        "        print(\"Your GPU supports bfloat16, you can accelerate training with the argument --bf16\")\n",
+        "        print(\"Your GPU supports bfloat16: accelerate training with bf16=True\")\n",
         "        print(\"=\" * 80)\n",
         "\n",
+        "# Load base model\n",
         "model = AutoModelForCausalLM.from_pretrained(\n",
         "    model_name,\n",
         "    quantization_config=bnb_config,\n",
@@ -191,6 +228,7 @@
         "model.config.use_cache = False\n",
         "model.config.pretraining_tp = 1\n",
         "\n",
+        "# Load LoRA configuration\n",
         "peft_config = LoraConfig(\n",
         "    lora_alpha=lora_alpha,\n",
         "    lora_dropout=lora_dropout,\n",
@@ -199,19 +237,22 @@
         "    task_type=\"CAUSAL_LM\",\n",
         ")\n",
         "\n",
+        "# Load LLaMA tokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
         "tokenizer.pad_token = tokenizer.eos_token\n",
-        "# Fix weird overflow issue with fp16 training\n",
-        "tokenizer.padding_side = \"right\"\n",
+        "tokenizer.padding_side = \"right\" # Fix weird overflow issue with fp16 training\n",
         "\n",
+        "# Set training parameters\n",
         "training_arguments = TrainingArguments(\n",
         "    output_dir=output_dir,\n",
+        "    num_train_epochs=num_train_epochs,\n",
         "    per_device_train_batch_size=per_device_train_batch_size,\n",
         "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
         "    optim=optim,\n",
         "    save_steps=save_steps,\n",
         "    logging_steps=logging_steps,\n",
         "    learning_rate=learning_rate,\n",
+        "    weight_decay=weight_decay,\n",
         "    fp16=fp16,\n",
         "    bf16=bf16,\n",
         "    max_grad_norm=max_grad_norm,\n",
@@ -219,8 +260,10 @@
         "    warmup_ratio=warmup_ratio,\n",
         "    group_by_length=group_by_length,\n",
         "    lr_scheduler_type=lr_scheduler_type,\n",
+        "    report_to=\"tensorboard\"\n",
         ")\n",
         "\n",
+        "# Set supervised fine-tuning parameters\n",
         "trainer = SFTTrainer(\n",
         "    model=model,\n",
         "    train_dataset=dataset,\n",
@@ -232,7 +275,10 @@
         "    packing=packing,\n",
         ")\n",
         "\n",
+        "# Train model\n",
         "trainer.train()\n",
+        "\n",
+        "# Save trained model\n",
         "trainer.model.save_pretrained(output_dir)"
       ],
       "metadata": {
@@ -267,29 +313,21 @@
     {
       "cell_type": "code",
       "source": [
-        "from numba import cuda\n",
-        "\n",
-        "if use_4bit:\n",
-        "    del model\n",
-        "    torch.cuda.empty_cache()\n",
-        "    cuda.select_device(0)\n",
-        "    cuda.close()\n",
-        "\n",
-        "    base_model = AutoModelForCausalLM.from_pretrained(\n",
-        "        model_name,\n",
-        "        low_cpu_mem_usage=True,\n",
-        "        return_dict=True,\n",
-        "        torch_dtype=torch.float16,\n",
-        "        device_map=device_map,\n",
-        "    )\n",
-        "    model = PeftModel.from_pretrained(base_model, output_dir, offload_folder=\"/content/sample_data\")\n",
-        "    model = model.merge_and_unload()\n",
-        "\n",
-        "# Save merged weights and tokenizer\n",
-        "model.save_pretrained(new_model, use_safetensors=True)\n",
+        "# Reload model in FP16 and merge it with LoRA weights\n",
+        "base_model = AutoModelForCausalLM.from_pretrained(\n",
+        "    model_name,\n",
+        "    low_cpu_mem_usage=True,\n",
+        "    return_dict=True,\n",
+        "    torch_dtype=torch.float16,\n",
+        "    device_map=device_map,\n",
+        ")\n",
+        "model = PeftModel.from_pretrained(base_model, output_dir)\n",
+        "model = model.merge_and_unload()\n",
+        "\n",
+        "# Reload tokenizer to save it\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
         "tokenizer.pad_token = tokenizer.eos_token\n",
-        "tokenizer.save_pretrained(new_model)"
+        "tokenizer.padding_side = \"right\""
       ],
       "metadata": {
         "id": "QQn30cRtAZ-P"