From 7cfda6a21f2bf7840df7262bf397a06a0e92c978 Mon Sep 17 00:00:00 2001
From: zanussbaum
Date: Fri, 7 Apr 2023 16:54:29 -0400
Subject: [PATCH 1/7] feat: update for mosaic

---
 configs/train/finetune_gptj.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/configs/train/finetune_gptj.yaml b/configs/train/finetune_gptj.yaml
index f37283b3..aa6bf4a8 100644
--- a/configs/train/finetune_gptj.yaml
+++ b/configs/train/finetune_gptj.yaml
@@ -2,14 +2,14 @@
 model_name: "EleutherAI/gpt-j-6B"
 tokenizer_name: "EleutherAI/gpt-j-6B"
 gradient_checkpointing: true
-save_name: "nomic-ai/gpt4all-gptj-multinode-deepspeed"
+save_name: "nomic-ai/gpt4all-mosaic"

 # dataset
 streaming: false
 num_proc: 64
-dataset_path: "data_multiplus"
+dataset_path: "nomic-ai/turbo-500k-multi"
 max_length: 1024
-batch_size: 32
+batch_size: 16

 # train dynamics
 lr: 2.0e-5
@@ -23,7 +23,7 @@ output_dir: "ckpts/gpt4all-gptj-multinode"
 checkpoint: null
 lora: false
 warmup_steps: 500
-num_epochs: 4
+num_epochs: 2

 # logging
 wandb: true

From 2b001e8932b4893f8b1dc1a1e2498f997618c865 Mon Sep 17 00:00:00 2001
From: zanussbaum
Date: Fri, 7 Apr 2023 17:41:45 -0400
Subject: [PATCH 2/7] fix: batch size

---
 configs/train/finetune_gptj.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/train/finetune_gptj.yaml b/configs/train/finetune_gptj.yaml
index aa6bf4a8..ce6feef7 100644
--- a/configs/train/finetune_gptj.yaml
+++ b/configs/train/finetune_gptj.yaml
@@ -9,7 +9,7 @@ streaming: false
 num_proc: 64
 dataset_path: "nomic-ai/turbo-500k-multi"
 max_length: 1024
-batch_size: 16
+batch_size: 8

 # train dynamics
 lr: 2.0e-5

From 147c2fd7ebee2a4dcf0adc97b48162f765bb7ce6 Mon Sep 17 00:00:00 2001
From: zanussbaum
Date: Fri, 7 Apr 2023 17:53:07 -0400
Subject: [PATCH 3/7] feat: lora gptj

---
 configs/train/finetune_gptj_lora.yaml | 33 +++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 configs/train/finetune_gptj_lora.yaml

diff --git a/configs/train/finetune_gptj_lora.yaml b/configs/train/finetune_gptj_lora.yaml
new file mode 100644
index 00000000..3bc20cd4
--- /dev/null
+++ b/configs/train/finetune_gptj_lora.yaml
@@ -0,0 +1,33 @@
+# model/tokenizer
+model_name: "EleutherAI/gpt-j-6B"
+tokenizer_name: "EleutherAI/gpt-j-6B"
+gradient_checkpointing: false
+save_name: "nomic-ai/gpt4all-mosaic"
+
+# dataset
+streaming: false
+num_proc: 64
+dataset_path: "nomic-ai/turbo-500k-multi"
+max_length: 1024
+batch_size: 4
+
+# train dynamics
+lr: 2.0e-5
+min_lr: 0
+weight_decay: 0.0
+eval_every: 500
+eval_steps: 105
+save_every: 500
+log_grads_every: 500
+output_dir: "ckpts/gpt4all-gptj-multinode"
+checkpoint: null
+lora: true
+warmup_steps: 500
+num_epochs: 2
+
+# logging
+wandb: true
+wandb_entity: zanussbaum
+wandb_project_name: mosaic
+seed: 42
+

From b66f127ade7f324c01d56db4ad7d03a5d5fb3fa8 Mon Sep 17 00:00:00 2001
From: Zach Nussbaum
Date: Sat, 8 Apr 2023 20:33:02 +0000
Subject: [PATCH 4/7] fix: config + ignore pkl

---
 .gitignore                            | 1 +
 configs/train/finetune_gptj_lora.yaml | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 780ecb37..bfce2c0c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+*.pkl
 ckpts*
 .deepspeed_env
 *.jsonl
diff --git a/configs/train/finetune_gptj_lora.yaml b/configs/train/finetune_gptj_lora.yaml
index 3bc20cd4..1586b7e7 100644
--- a/configs/train/finetune_gptj_lora.yaml
+++ b/configs/train/finetune_gptj_lora.yaml
@@ -1,6 +1,6 @@
 # model/tokenizer
-model_name: "EleutherAI/gpt-j-6B"
-tokenizer_name: "EleutherAI/gpt-j-6B"
+model_name: "EleutherAI/gpt-j-6b" +tokenizer_name: "EleutherAI/gpt-j-6b" gradient_checkpointing: false save_name: "nomic-ai/gpt4all-mosaic" From be3f528810128a1398407e9ee42519db6e6966da Mon Sep 17 00:00:00 2001 From: Zach Nussbaum Date: Sat, 8 Apr 2023 20:33:51 +0000 Subject: [PATCH 5/7] fix: tokenization error --- data.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/data.py b/data.py index 8a0dd83f..7d61154d 100644 --- a/data.py +++ b/data.py @@ -9,10 +9,6 @@ from transformers import DefaultDataCollator def tokenize_inputs(config, tokenizer, examples): max_length = config["max_length"] - # ignore bos - newline_tokens = tokenizer("\n", return_tensors="pt")["input_ids"][0] - if newline_tokens[0] == tokenizer.bos_token_id: - newline_tokens = newline_tokens[1:] # hacky backward compatible different_eos = tokenizer.eos_token != "" @@ -22,7 +18,7 @@ def tokenize_inputs(config, tokenizer, examples): if response.count("") > 0: response = response.replace("", tokenizer.eos_token) - prompt_len = len(tokenizer(prompt, return_tensors="pt")["input_ids"][0]) + prompt_len = len(tokenizer(prompt + "\n", return_tensors="pt")["input_ids"][0]) # hack if our prompt is super long # we need to include some labels so we arbitrarily trunacate at max_length // 2 @@ -33,7 +29,7 @@ def tokenize_inputs(config, tokenizer, examples): new_len = min(max_length // 2, len(prompt) // 2) prompt = prompt[:new_len] # get new prompt length - prompt_len = tokenizer(prompt, return_tensors="pt", max_length=max_length // 2, truncation=True).input_ids.ne(tokenizer.pad_token_id).sum().item() + prompt_len = tokenizer(prompt + "\n", return_tensors="pt", max_length=max_length // 2, truncation=True).input_ids.ne(tokenizer.pad_token_id).sum().item() assert prompt_len <= max_length // 2, f"prompt length {prompt_len} exceeds max length {max_length}" @@ -41,11 +37,13 @@ def tokenize_inputs(config, tokenizer, examples): truncation=True, max_length=max_length, return_tensors="pt")["input_ids"].squeeze() labels = input_tokens.clone() - labels[:prompt_len + len(newline_tokens)] = -100 + labels[:prompt_len] = -100 if len(labels) < max_length: # pad to max_length with -100 labels = torch.cat([labels, torch.full((max_length - len(labels),), -100)]) + assert (labels == -100).sum() < len(labels), f"Labels are all -100, something wrong. 
prompt length {prompt_len} exceeds max length {max_length}" + if (labels == -100).sum() == len(labels) - 1: print(prompt) print(response) From c82ee7d882ea9274a05f40f772c71b665b8cb4c8 Mon Sep 17 00:00:00 2001 From: Zach Nussbaum Date: Sat, 8 Apr 2023 20:37:51 +0000 Subject: [PATCH 6/7] fix: add wd + min lr to config --- configs/train/finetune_lora.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/configs/train/finetune_lora.yaml b/configs/train/finetune_lora.yaml index acdc0e95..cf916d3b 100644 --- a/configs/train/finetune_lora.yaml +++ b/configs/train/finetune_lora.yaml @@ -7,12 +7,14 @@ save_name: "nomic-ai/gpt4all-lora-multi-turn" # dataset streaming: false num_proc: 64 -dataset_path: "data_multiturn" +dataset_path: "nomic-ai/turbo-500k-multi" max_length: 1024 batch_size: 4 # train dynamics lr: 5.0e-5 +min_lr: 0 +weight_decay: 0.0 eval_every: 2000 eval_steps: 100 save_every: 2000 From 31195270cbc77f7e63413548bdf9cd5cc28e543f Mon Sep 17 00:00:00 2001 From: Zach Nussbaum Date: Sat, 8 Apr 2023 20:38:10 +0000 Subject: [PATCH 7/7] fix: eos/pad token + wd --- train.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/train.py b/train.py index 463dd155..72e53f4d 100644 --- a/train.py +++ b/train.py @@ -1,8 +1,6 @@ import os -from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler -from transformers.trainer_pt_utils import get_parameter_names +from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler, LlamaForCausalLM import torch -import torch.nn as nn from torch.optim import AdamW from argparse import ArgumentParser from read import read_config @@ -45,7 +43,7 @@ def train(accelerator, config): accelerator.print(f"Using {accelerator.num_processes} GPUs") tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'], model_max_length=config['max_length']) - # llama has no pad token, set it to new token + # if no pad token, set it to eos if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -76,21 +74,9 @@ def train(accelerator, config): else DummyOptim ) - no_decay = ["bias", "LayerNorm.weight"] - optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": config["weight_decay"], - }, - { - "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], - "weight_decay": 0.0, - }, - ] - # karpathy doesn't decay embeddding, maybe we should exclude # https://github.com/karpathy/minGPT/commit/bbbdac74fa9b2e55574d70056163ffbae42310c1#diff-2075fa9c224b395be5bda85544dd36572b59c76c54562819eadadbf268602834R157s - optimizer = optimizer_cls(optimizer_grouped_parameters, lr=config["lr"]) + optimizer = optimizer_cls(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"]) if accelerator.state.deepspeed_plugin is not None: gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[
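
For context on the series: the substance of PATCH 5 is that the prompt plus its "\n" separator is tokenized once to get prompt_len, and those label positions are set to -100 so only response tokens contribute to the loss (-100 is the default ignore_index of PyTorch's CrossEntropyLoss, which is also why padding the labels with -100 keeps padded positions out of the loss). Below is a minimal standalone sketch of that masking idea, not the repo's code: mask_prompt_labels is an illustrative helper name and "gpt2" is only a stand-in tokenizer, not the model these configs train.

# Sketch only: mirrors the prompt-masking idea from PATCH 5 under the assumptions above.
import torch
from transformers import AutoTokenizer

def mask_prompt_labels(tokenizer, prompt, response, max_length=1024):
    # Length of the prompt plus the "\n" separator, measured the same way the patch does.
    prompt_len = len(tokenizer(prompt + "\n", return_tensors="pt")["input_ids"][0])
    input_ids = tokenizer(prompt + "\n" + response + tokenizer.eos_token,
                          truncation=True, max_length=max_length,
                          return_tensors="pt")["input_ids"].squeeze()
    labels = input_ids.clone()
    labels[:prompt_len] = -100                # prompt tokens are ignored by the loss
    if len(labels) < max_length:              # right-pad labels with -100
        labels = torch.cat([labels, torch.full((max_length - len(labels),), -100)])
    return input_ids, labels

tok = AutoTokenizer.from_pretrained("gpt2")
ids, labels = mask_prompt_labels(tok, "What is 2 + 2?", "4")
assert (labels != -100).any()                 # the response tokens remain supervised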