mirror of
https://github.com/nomic-ai/gpt4all
synced 2024-11-02 09:40:42 +00:00
Merge branch 'gptj' of github.com:nomic-ai/gpt4all into gptj
This commit is contained in:
commit
bbbf007ed9
17
GPTJ.md
Normal file
17
GPTJ.md
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
# Inference on Training Data
|
||||||
|
|
||||||
|
|
||||||
|
## Run Inference
|
||||||
|
|
||||||
|
```bash
|
||||||
|
torchrun --master_port=29085 --nproc-per-node 8 inference.py --config=configs/inference/gptj.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Visualizations
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python build_map.py
|
||||||
|
```
|
||||||
|
|
||||||
|
This builds two maps in `Atlas`: one using the internal clustering algorithm provided by Nomic and one using the embeddings generated by the finetuned model.
|
@ -1,15 +1,5 @@
|
|||||||
# model/tokenizer
|
# model/tokenizer
|
||||||
model_name: # update with llama 7b
|
model_name: # update with llama model name
|
||||||
tokenizer_name: # update with llama 7b
|
tokenizer_name: # update with llama model name
|
||||||
lora: true
|
lora: true
|
||||||
lora_path: "nomic-ai/gpt4all-lora"
|
lora_path: "nomic-ai/gpt4all-lora"
|
||||||
|
|
||||||
max_new_tokens: 512
|
|
||||||
temperature: 0.001
|
|
||||||
prompt: |
|
|
||||||
#this code prints a string reversed
|
|
||||||
my_string = "hello how are you"
|
|
||||||
print(len(my_string))
|
|
||||||
|
|
||||||
|
|
||||||
My code above does not work. Can you help me?
|
|
||||||
|
@ -2,16 +2,4 @@
|
|||||||
model_name: # update with llama model name
|
model_name: # update with llama model name
|
||||||
tokenizer_name: # update with llama model name
|
tokenizer_name: # update with llama model name
|
||||||
lora: true
|
lora: true
|
||||||
lora_path: "tloen/alpaca-lora-7b"
|
lora_path: "tloen/alpaca-lora-7b"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
max_new_tokens: 512
|
|
||||||
temperature: 0.001
|
|
||||||
prompt: |
|
|
||||||
#this code prints a string reversed
|
|
||||||
my_string = "hello how are you"
|
|
||||||
print(len(my_string))
|
|
||||||
|
|
||||||
|
|
||||||
My code above does not work. Can you help me?
|
|
@ -1,14 +0,0 @@
|
|||||||
# model/tokenizer
|
|
||||||
model_name: # update
|
|
||||||
tokenizer_name: # update
|
|
||||||
lora_path: "no-lora"
|
|
||||||
|
|
||||||
max_new_tokens: 512
|
|
||||||
temperature: 0.001
|
|
||||||
prompt: |
|
|
||||||
#this code prints a string reversed
|
|
||||||
my_string = "hello how are you"
|
|
||||||
print(len(my_string))
|
|
||||||
|
|
||||||
|
|
||||||
My code above does not work. Can you help me?
|
|
4
configs/eval/generate_gpt4all_gptj.yaml
Normal file
4
configs/eval/generate_gpt4all_gptj.yaml
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# model/tokenizer
|
||||||
|
model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1"
|
||||||
|
tokenizer_name: "EleutherAI/gpt-j-6b"
|
||||||
|
lora: false
|
5
configs/eval/generate_gpt4all_gptj_lora.yaml
Normal file
5
configs/eval/generate_gpt4all_gptj_lora.yaml
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# model/tokenizer
|
||||||
|
model_name: "EleutherAI/gpt-j-6b"
|
||||||
|
tokenizer_name: "EleutherAI/gpt-j-6B"
|
||||||
|
lora: true
|
||||||
|
lora_path: "nomic-ai/gpt4all-gptj-lora-epoch_1"
|
@ -1,15 +0,0 @@
|
|||||||
# model/tokenizer
|
|
||||||
model_name: # update
|
|
||||||
tokenizer_name: # update
|
|
||||||
lora: true
|
|
||||||
lora_path: # update
|
|
||||||
|
|
||||||
max_new_tokens: 512
|
|
||||||
temperature: 0.001
|
|
||||||
prompt: |
|
|
||||||
#this code prints a string reversed
|
|
||||||
my_string = "hello how are you"
|
|
||||||
print(len(my_string))
|
|
||||||
|
|
||||||
|
|
||||||
My code above does not work. Can you help me?
|
|
@ -1,15 +0,0 @@
|
|||||||
# model/tokenizer
|
|
||||||
model_name: # update
|
|
||||||
tokenizer_name: # update
|
|
||||||
lora: true
|
|
||||||
lora_path: # update
|
|
||||||
|
|
||||||
max_new_tokens: 512
|
|
||||||
temperature: 0.001
|
|
||||||
prompt: |
|
|
||||||
#this code prints a string reversed
|
|
||||||
my_string = "hello how are you"
|
|
||||||
print(len(my_string))
|
|
||||||
|
|
||||||
|
|
||||||
My code above does not work. Can you help me?
|
|
@ -1,11 +1,11 @@
|
|||||||
# model/tokenizer
|
# model/tokenizer
|
||||||
model_name: "nomic-ai/gpt4all-gptj-multinode-deepspeed-finetuned-epoch_0"
|
model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1"
|
||||||
tokenizer_name: "EleutherAI/gpt-j-6B"
|
tokenizer_name: "EleutherAI/gpt-j-6B"
|
||||||
|
|
||||||
# dataset
|
# dataset
|
||||||
streaming: false
|
streaming: false
|
||||||
num_proc: 64
|
num_proc: 64
|
||||||
dataset_path: "data_multiplus"
|
dataset_path: "nomic-ai/turbo-500k-multi"
|
||||||
max_length: 1024
|
max_length: 1024
|
||||||
batch_size: 32
|
batch_size: 32
|
||||||
|
|
||||||
|
@ -2,14 +2,14 @@
|
|||||||
model_name: "EleutherAI/gpt-j-6B"
|
model_name: "EleutherAI/gpt-j-6B"
|
||||||
tokenizer_name: "EleutherAI/gpt-j-6B"
|
tokenizer_name: "EleutherAI/gpt-j-6B"
|
||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
save_name: "nomic-ai/gpt4all-mosaic"
|
save_name: "nomic-ai/gpt4all-warmup-lr"
|
||||||
|
|
||||||
# dataset
|
# dataset
|
||||||
streaming: false
|
streaming: false
|
||||||
num_proc: 64
|
num_proc: 64
|
||||||
dataset_path: "nomic-ai/turbo-500k-multi"
|
dataset_path: "nomic-ai/turbo-500k-multi"
|
||||||
max_length: 1024
|
max_length: 1024
|
||||||
batch_size: 8
|
batch_size: 32
|
||||||
|
|
||||||
# train dynamics
|
# train dynamics
|
||||||
lr: 2.0e-5
|
lr: 2.0e-5
|
||||||
|
@ -6,18 +6,20 @@ from matplotlib import pyplot as plt
|
|||||||
plt.figure()
|
plt.figure()
|
||||||
for fpath in glob.glob('./eval_data/*.pkl'):
|
for fpath in glob.glob('./eval_data/*.pkl'):
|
||||||
parts = fpath.split('__')
|
parts = fpath.split('__')
|
||||||
model_name = parts[1].replace('model-', '').replace('.pkl', '')
|
model_name = "-".join(fpath.replace(".pkl", "").split("_")[2:])
|
||||||
lora_name = parts[2].replace('lora-', '').replace('.pkl', '')
|
|
||||||
with open(fpath, 'rb') as f:
|
with open(fpath, 'rb') as f:
|
||||||
data = pickle.load(f)
|
data = pickle.load(f)
|
||||||
perplexities = data['perplexities']
|
perplexities = data['perplexities']
|
||||||
perplexities = np.nan_to_num(perplexities, 100)
|
perplexities = np.nan_to_num(perplexities, 100)
|
||||||
perplexities = np.clip(perplexities, 0, 100)
|
perplexities = np.clip(perplexities, 0, 100)
|
||||||
if 'nomic' in fpath:
|
if 'alpaca' not in fpath:
|
||||||
label = 'GPT4all-lora'
|
identifier = model_name = "-".join(fpath.replace(".pkl", "").split("eval__model-")[1:])
|
||||||
|
label = 'GPT4all-'
|
||||||
|
label += identifier
|
||||||
|
|
||||||
else:
|
else:
|
||||||
label = 'alpaca-lora'
|
label = 'alpaca-lora'
|
||||||
plt.hist(perplexities, label=label, alpha=.5)
|
plt.hist(perplexities, label=label, alpha=.5, bins=50)
|
||||||
|
|
||||||
plt.xlabel('Perplexity')
|
plt.xlabel('Perplexity')
|
||||||
plt.ylabel('Frequency')
|
plt.ylabel('Frequency')
|
||||||
|
@ -49,28 +49,6 @@ def eval_example(model, tokenizer, example, config):
|
|||||||
input = tokenizer(prompt, return_tensors="pt")
|
input = tokenizer(prompt, return_tensors="pt")
|
||||||
input = {k: v.to(model.device) for k, v in input.items()}
|
input = {k: v.to(model.device) for k, v in input.items()}
|
||||||
|
|
||||||
continuations = []
|
|
||||||
tokenized_continuations = []
|
|
||||||
trajectories = []
|
|
||||||
for i in range(1):
|
|
||||||
with torch.no_grad():
|
|
||||||
outputs = model.generate(input_ids=input['input_ids'],
|
|
||||||
max_new_tokens=config["max_new_tokens"],
|
|
||||||
min_new_tokens=5,
|
|
||||||
temperature=config["temperature"],
|
|
||||||
repetition_penalty=1.0,
|
|
||||||
do_sample=True)
|
|
||||||
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
|
|
||||||
|
|
||||||
y = model(input_ids=outputs)
|
|
||||||
trajectory = y.hidden_states[0].detach().cpu().numpy()[0]
|
|
||||||
trajectory = trajectory / np.linalg.norm(trajectory, axis=1, keepdims=True)
|
|
||||||
trajectory = np.cumsum(trajectory, axis=0) / np.arange(1, trajectory.shape[0]+1).reshape(-1, 1)
|
|
||||||
|
|
||||||
trajectories.append(trajectory)
|
|
||||||
continuations.append(decoded)
|
|
||||||
tokenized_continuations.append(tokenizer.tokenize(decoded))
|
|
||||||
|
|
||||||
#compute the ground truth perplexity
|
#compute the ground truth perplexity
|
||||||
gt_input = tokenizer(gt, return_tensors="pt")
|
gt_input = tokenizer(gt, return_tensors="pt")
|
||||||
gt_input = {k: v.to(model.device) for k, v in gt_input.items()}
|
gt_input = {k: v.to(model.device) for k, v in gt_input.items()}
|
||||||
@ -101,30 +79,23 @@ def eval_example(model, tokenizer, example, config):
|
|||||||
|
|
||||||
print(prompt)
|
print(prompt)
|
||||||
print(80*'-')
|
print(80*'-')
|
||||||
for continuation in continuations:
|
|
||||||
print(continuation)
|
|
||||||
print(80*'-')
|
|
||||||
|
|
||||||
return ppl, trajectories, continuations, tokenized_continuations
|
return ppl
|
||||||
|
|
||||||
def do_eval(config):
|
def do_eval(config):
|
||||||
eval_data = read_jsonl_file('eval_data/user_oriented_instructions.jsonl')
|
eval_data = read_jsonl_file('eval_data/user_oriented_instructions.jsonl')
|
||||||
model, tokenizer = setup_model(config)
|
model, tokenizer = setup_model(config)
|
||||||
all_trajectories = []
|
|
||||||
all_perplexities = []
|
all_perplexities = []
|
||||||
all_continuations = []
|
|
||||||
all_tokenized_continuations = []
|
|
||||||
for example in tqdm(eval_data):
|
for example in tqdm(eval_data):
|
||||||
gt_perplexity, trajectories, continuations, tokenized_continuations = eval_example(model, tokenizer, example, config)
|
gt_perplexity = eval_example(model, tokenizer, example, config)
|
||||||
all_trajectories.append(trajectories)
|
|
||||||
all_perplexities.append(gt_perplexity)
|
all_perplexities.append(gt_perplexity)
|
||||||
all_continuations.append(continuations)
|
|
||||||
|
|
||||||
with open('eval_data/eval__model-{}__lora-{}.pkl'.format(config['model_name'].replace('/', '_'), config['lora_path'].replace('/', '_')), 'wb') as f:
|
|
||||||
r = {'trajectories': all_trajectories,
|
name = f"eval_data/eval__model-{config['model_name'].replace('/', '_')}{'__lora-' + config['lora_path'].replace('/', '_') if config['lora'] else ''}.pkl"
|
||||||
'perplexities': all_perplexities,
|
|
||||||
'continuations': all_continuations,
|
with open(name, 'wb') as f:
|
||||||
'tokenized_continuations': all_tokenized_continuations}
|
r = {'perplexities': all_perplexities}
|
||||||
pickle.dump(r, f)
|
pickle.dump(r, f)
|
||||||
|
|
||||||
|
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 26 KiB |
@ -11,4 +11,5 @@ deepspeed
|
|||||||
sentencepiece
|
sentencepiece
|
||||||
jsonlines
|
jsonlines
|
||||||
nomic
|
nomic
|
||||||
scikit-learn
|
scikit-learn
|
||||||
|
matplotlib
|
Loading…
Reference in New Issue
Block a user