diff --git a/configs/deepspeed/ds_config_gptj.json b/configs/deepspeed/ds_config_gptj.json
index dc856e1b..9e7a410a 100644
--- a/configs/deepspeed/ds_config_gptj.json
+++ b/configs/deepspeed/ds_config_gptj.json
@@ -1,6 +1,6 @@
 {
   "train_batch_size": "auto",
-  "gradient_accumulation_steps": 4,
+  "gradient_accumulation_steps": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "fp16": {
     "enabled": "auto",
@@ -19,7 +19,7 @@
       "device": "none"
     },
     "offload_optimizer": {
-      "device": "none"
+      "device": "cpu"
     },
     "allgather_partitions": true,
     "allgather_bucket_size": 5e8,
diff --git a/configs/train/finetune_gptj.yaml b/configs/train/finetune_gptj.yaml
index 31c25050..1b42d780 100644
--- a/configs/train/finetune_gptj.yaml
+++ b/configs/train/finetune_gptj.yaml
@@ -2,24 +2,24 @@
 model_name: "EleutherAI/gpt-j-6B"
 tokenizer_name: "EleutherAI/gpt-j-6B"
 gradient_checkpointing: true
-save_name: "nomic-ai/gpt4all-gptj-multiturn-lr-aggressive"
+save_name: "nomic-ai/gpt4all-gptj-multinode"
 
 # dataset
 streaming: false
 num_proc: 64
 dataset_path: "data_multiplus"
 max_length: 1024
-batch_size: 8
+batch_size: 32
 
 # train dynamics
-lr: 2.0e-5
+lr: 4.0e-5
 min_lr: 0
 weight_decay: 0.0
-eval_every: 200
+eval_every: 100
 eval_steps: 105
-save_every: 400
-log_grads_every: 200
-output_dir: "ckpts/gpt4all-gptj-full-multiturn-lr-aggreive"
+save_every: 100
+log_grads_every: 100
+output_dir: "ckpts/gpt4all-gptj-multinode"
 checkpoint: null
 lora: false
 warmup_steps: 500
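
For context on how the switched-to `"auto"` fields get filled in: below is a minimal sketch assuming the DeepSpeed JSON is consumed through the Hugging Face Trainer integration, which substitutes each `"auto"` entry with the matching `TrainingArguments` value at launch time. The repo's actual launch script may wire these values differently, and the specific argument values are illustrative, mirrored from `finetune_gptj.yaml` above.

```python
# Sketch only: assumes the HF Trainer DeepSpeed integration resolves the "auto"
# fields in configs/deepspeed/ds_config_gptj.json; values mirror finetune_gptj.yaml
# for illustration and are not taken from the repo's own launch code.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="ckpts/gpt4all-gptj-multinode",
    deepspeed="configs/deepspeed/ds_config_gptj.json",
    per_device_train_batch_size=32,  # fills train_micro_batch_size_per_gpu: "auto"
    gradient_accumulation_steps=1,   # fills gradient_accumulation_steps: "auto" (assumed value)
    fp16=True,                       # fills fp16.enabled: "auto"
    learning_rate=4.0e-5,
    warmup_steps=500,
    weight_decay=0.0,
)
# train_batch_size: "auto" is then derived as
# per_device_train_batch_size * gradient_accumulation_steps * world_size,
# while optimizer state is kept on CPU per the new offload_optimizer "device": "cpu".
```

Offloading the optimizer state to CPU trades some step time for GPU memory, which is what helps the larger per-GPU batch size (8 → 32) fit in the multinode run.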