mirror of https://github.com/nomic-ai/gpt4all, commit 51264f5eac
@ -1,6 +1,3 @@
[submodule "transformers"]
	path = transformers
	url = https://github.com/huggingface/transformers.git
[submodule "peft"]
	path = peft
	url = https://github.com/huggingface/peft.git
@ -0,0 +1,17 @@
# Inference on Training Data

## Run Inference

```bash
torchrun --master_port=29085 --nproc-per-node 8 inference.py --config=configs/inference/gptj.yaml
```

## Visualizations

```bash
python build_map.py
```

This builds two maps in `Atlas`: one using the internal clustering algorithm provided by Nomic and one using the embeddings generated by the finetuned model.
@ -0,0 +1,54 @@
import numpy as np
from nomic import atlas
import glob
from tqdm import tqdm
from datasets import load_dataset, concatenate_datasets
from sklearn.decomposition import PCA

# merge all per-rank shards written by inference.py
files = glob.glob("inference/*.jsonl")
print(files)
df = concatenate_datasets([load_dataset("json", data_files=file, split="train") for file in tqdm(files)])

print(len(df))
print(df)

# concatenate prompt and response into a single display field
df = df.map(lambda example: {"inputs": [prompt + "\n" + response for prompt, response in zip(example["prompt"], example["response"])]},
            batched=True,
            num_proc=64)

df = df.map(lambda example: {"trained_on": [int(t) for t in example["is_train"]]},
            batched=True,
            num_proc=64)

df = df.remove_columns("is_train")

text = df.remove_columns(["labels", "input_ids", "embeddings"])

text_df = [text[i] for i in range(len(text))]

# map 1: let Atlas embed and cluster the raw text itself
atlas.map_text(text_df, indexed_field="inputs",
               name="CHANGE ME!",
               colorable_fields=["source", "loss", "trained_on"],
               reset_project_if_exists=True,
               )

# index is local to train/test split, regenerate
data = df.remove_columns(["labels", "input_ids", "index"])
data = data.add_column("index", list(range(len(data))))
# max embed dim is 2048 for now
# note! this is slow in pyarrow/hf datasets
embeddings = np.array(data["embeddings"])
print("embeddings shape:", embeddings.shape)
embeddings = PCA(n_components=2048).fit_transform(embeddings)

data = data.remove_columns(["embeddings"])
columns = data.to_pandas().to_dict("records")

# map 2: project the finetuned model's own embeddings
atlas.map_embeddings(embeddings,
                     data=columns,
                     id_field="index",
                     name="CHANGE ME!",
                     colorable_fields=["source", "loss", "trained_on"],
                     build_topic_model=True,
                     topic_label_field="inputs",
                     reset_project_if_exists=True,)
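One practical note on the PCA step above: the inline comment says Atlas caps uploaded embeddings at 2048 dimensions, and scikit-learn's `PCA` additionally requires `n_components <= min(n_samples, n_features)`, so this script assumes at least 2048 rows of merged embeddings.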
@ -0,0 +1,48 @@
{
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {
        "enabled": "auto",
        "min_loss_scale": 1,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "initial_scale_power": 32
    },
    "bf16": {
        "enabled": "auto"
    },
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": 2,
        "offload_param": {
            "device": "none"
        },
        "offload_optimizer": {
            "device": "none"
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "contiguous_gradients": true
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": [
                0.9,
                0.999
            ],
            "eps": 1e-08
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "warmup_type": "linear"
        }
    }
}
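The `"auto"` values in this config are only resolved when a launcher fills them in from its own arguments. As a minimal sketch of that wiring, assuming the standard Hugging Face `Trainer` integration (this repo's own training script may plumb the config in differently, and the config path below is hypothetical):

```python
# Hedged sketch: how "auto" fields in a DeepSpeed JSON get resolved.
# Assumes the Hugging Face Trainer integration; the config path is hypothetical.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=32,  # resolves "train_micro_batch_size_per_gpu": "auto"
    gradient_accumulation_steps=1,   # resolves "gradient_accumulation_steps": "auto"
    learning_rate=2e-5,              # resolves optimizer "lr" and "warmup_max_lr": "auto"
    warmup_steps=500,                # resolves "warmup_num_steps": "auto"
    bf16=True,                       # resolves "bf16.enabled": "auto"
    deepspeed="configs/deepspeed/ds_config.json",  # hypothetical path to the JSON above
)
# Trainer(model=model, args=args, train_dataset=dataset).train()
```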
@ -0,0 +1,48 @@
{
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "fp16": {
        "enabled": "auto",
        "min_loss_scale": 1,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "initial_scale_power": 32
    },
    "bf16": {
        "enabled": "auto"
    },
    "gradient_clipping": 1,
    "zero_optimization": {
        "stage": 2,
        "offload_param": {
            "device": "cpu"
        },
        "offload_optimizer": {
            "device": "cpu"
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 5e8,
        "contiguous_gradients": true
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": [
                0.9,
                0.999
            ],
            "eps": 1e-08
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
            "warmup_type": "linear"
        }
    }
}
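The only differences from the previous config are the ZeRO offload targets: this variant pushes optimizer state (and, nominally, parameters) to CPU to cut GPU memory at the cost of throughput. Note that in DeepSpeed, `offload_param` is only honored under ZeRO stage 3; with `"stage": 2` as here, only the optimizer offload takes effect.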
@ -1,15 +0,0 @@
# model/tokenizer
model_name: # update with llama 7b
tokenizer_name: # update with llama 7b
lora: true
lora_path: "nomic-ai/gpt4all-lora"

max_new_tokens: 512
temperature: 0.001
prompt: |
  #this code prints a string reversed
  my_string = "hello how are you"
  print(len(my_string))


  My code above does not work. Can you help me?
@ -1,17 +1,5 @@
 # model/tokenizer
-model_name: # update with llama model name
-tokenizer_name: # update with llama model name
+model_name: "zpn/llama-7b"
+tokenizer_name: "zpn/llama-7b"
 lora: true
-lora_path: "tloen/alpaca-lora-7b"
-
-
-
-max_new_tokens: 512
-temperature: 0.001
-prompt: |
-  #this code prints a string reversed
-  my_string = "hello how are you"
-  print(len(my_string))
-
-
-  My code above does not work. Can you help me?
+lora_path: "tloen/alpaca-lora-7b"
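For configs with `lora: true`, the adapter at `lora_path` has to be layered onto the base model at load time. A minimal sketch of what that typically looks like with the PEFT library (the repo's actual loading code may differ; the model and adapter ids come from the config above):

```python
# Hedged sketch: loading a base model plus a LoRA adapter with PEFT.
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("zpn/llama-7b")
model = PeftModel.from_pretrained(base, "tloen/alpaca-lora-7b")  # applies the adapter weights
model.eval()
```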
@ -0,0 +1,4 @@
# model/tokenizer
model_name: "nomic-ai/gpt4all-warmup-lr-epoch_0"
tokenizer_name: "EleutherAI/gpt-j-6b"
lora: false
@ -0,0 +1,5 @@
# model/tokenizer
model_name: "EleutherAI/gpt-j-6b"
tokenizer_name: "EleutherAI/gpt-j-6B"
lora: true
lora_path: "nomic-ai/gpt4all-gptj-lora-epoch_1"
@ -0,0 +1,5 @@
# model/tokenizer
model_name: "zpn/llama-7b"
tokenizer_name: "zpn/llama-7b"
lora: true
lora_path: "nomic-ai/gpt4all-lora"
@ -1,15 +0,0 @@
# model/tokenizer
model_name: # update
tokenizer_name: # update
lora: true
lora_path: # update

max_new_tokens: 512
temperature: 0.001
prompt: |
  #this code prints a string reversed
  my_string = "hello how are you"
  print(len(my_string))


  My code above does not work. Can you help me?
@ -1,15 +0,0 @@
# model/tokenizer
model_name: # update
tokenizer_name: # update
lora: true
lora_path: # update

max_new_tokens: 512
temperature: 0.001
prompt: |
  #this code prints a string reversed
  my_string = "hello how are you"
  print(len(my_string))


  My code above does not work. Can you help me?
@ -1,7 +1,8 @@
 # model/tokenizer
-model_name: # update
-tokenizer_name: # update
-lora_path: "no-lora"
+model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1"
+tokenizer_name: "EleutherAI/gpt-j-6b"
+lora: false
+
 
 max_new_tokens: 512
 temperature: 0.001
@ -0,0 +1,15 @@
# model/tokenizer
model_name: "EleutherAI/gpt-j-6b"
tokenizer_name: "EleutherAI/gpt-j-6b"
lora: true
lora_path: "nomic-ai/gpt4all-gptj-lora-epoch_0"

max_new_tokens: 512
temperature: 0
prompt: |
  #this code prints a string reversed
  my_string = "hello how are you"
  print(len(my_string))


  My code above does not work. Can you help me?
@ -0,0 +1,14 @@
# model/tokenizer
model_name: "nomic-ai/gpt4all-warmup-lr-epoch_1"
tokenizer_name: "EleutherAI/gpt-j-6B"

# dataset
streaming: false
num_proc: 64
dataset_path: "nomic-ai/turbo-500k-multi"
max_length: 1024
batch_size: 32

# logging
seed: 42
@ -0,0 +1,33 @@
# model/tokenizer
model_name: "EleutherAI/gpt-j-6B"
tokenizer_name: "EleutherAI/gpt-j-6B"
gradient_checkpointing: true
save_name: # CHANGE

# dataset
streaming: false
num_proc: 64
dataset_path: # CHANGE
max_length: 1024
batch_size: 32

# train dynamics
lr: 2.0e-5
min_lr: 0
weight_decay: 0.0
eval_every: 500
eval_steps: 105
save_every: 500
log_grads_every: 100
output_dir: # CHANGE
checkpoint: null
lora: false
warmup_steps: 500
num_epochs: 2

# logging
wandb: true
wandb_entity: # CHANGE
wandb_project_name: # CHANGE
seed: 42
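These YAML configs are read into plain dicts at startup; `inference.py` below imports a `read_config` helper for this. Its implementation isn't shown in this diff, but it presumably amounts to something like the following sketch (`yaml.safe_load` and the config path are assumptions):

```python
# Hedged sketch of the repo's read_config helper (actual implementation not shown here).
import yaml

def read_config(path):
    # parse a YAML train/inference config, like the one above, into a dict
    with open(path) as f:
        return yaml.safe_load(f)

# config = read_config("configs/train/finetune_gptj.yaml")  # hypothetical path
# print(config["lr"], config["num_epochs"])
```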
@ -0,0 +1,33 @@
# model/tokenizer
model_name: "EleutherAI/gpt-j-6b"
tokenizer_name: "EleutherAI/gpt-j-6b"
gradient_checkpointing: false
save_name: # CHANGE

# dataset
streaming: false
num_proc: 64
dataset_path: # CHANGE
max_length: 1024
batch_size: 1

# train dynamics
lr: 2.0e-5
min_lr: 0
weight_decay: 0.0
eval_every: 500
eval_steps: 105
save_every: 500
log_grads_every: 500
output_dir: # CHANGE
checkpoint: null
lora: true
warmup_steps: 500
num_epochs: 2

# logging
wandb: true
wandb_entity: # CHANGE
wandb_project_name: # CHANGE
seed: 42
@ -0,0 +1,8 @@
#!/bin/bash

export WORKER_IP=$1
N_GPUS=8
# create dir if doesn't exist
sudo mkdir -p /job
printf "localhost slots=$N_GPUS\n$WORKER_IP slots=$N_GPUS" | sudo tee /job/hostfile
echo /job/hostfile
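The generated `/job/hostfile` uses the standard `hostname slots=N` format understood by MPI-style launchers; DeepSpeed's launcher, for instance, accepts it through its `--hostfile` flag, which would spread a job across `localhost` and `$WORKER_IP` with 8 GPUs each.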
(Two binary image files added, not shown: 2.3 MiB and 356 KiB.)
@ -0,0 +1,204 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn as nn
from argparse import ArgumentParser
from read import read_config
from accelerate.utils import set_seed
from data import load_data_for_inference
from tqdm import tqdm
from datasets import Dataset
import torch.distributed as dist
from transformers.trainer_pt_utils import nested_numpify
from transformers import DefaultDataCollator
from torch.utils.data import DataLoader, DistributedSampler
import numpy as np
import pyarrow as pa
from pyarrow import compute as pc


def calc_cross_entropy_no_reduction(lm_logits, labels):
    # calculate cross entropy across batch dim
    shift_logits = lm_logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    # Flatten the tokens
    loss_fct = nn.CrossEntropyLoss(reduction='none')
    loss = loss_fct(shift_logits.permute(0, 2, 1), shift_labels).mean(dim=1)

    return loss


def rank0_print(msg):
    if dist.get_rank() == 0:
        print(msg)


def inference(config):
    set_seed(config['seed'])

    rank0_print(f"World size: {dist.get_world_size()}")

    tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'], model_max_length=config['max_length'])
    # llama has no pad token; fall back to the eos token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    train_dataset, val_dataset = load_data_for_inference(config, tokenizer)

    num_processes = dist.get_world_size()
    local_rank = dist.get_rank()

    train_sampler = DistributedSampler(train_dataset, shuffle=False, drop_last=True, num_replicas=num_processes, rank=local_rank)
    train_dataloader = DataLoader(
        train_dataset,
        collate_fn=DefaultDataCollator(),
        batch_size=config["batch_size"],
        sampler=train_sampler,
        drop_last=True
    )

    val_sampler = DistributedSampler(val_dataset, shuffle=False, drop_last=True, num_replicas=num_processes, rank=local_rank)
    val_dataloader = DataLoader(
        val_dataset,
        collate_fn=DefaultDataCollator(),
        batch_size=config["batch_size"],
        sampler=val_sampler,
        drop_last=True
    )

    model = AutoModelForCausalLM.from_pretrained(config["model_name"],
                                                 trust_remote_code=True,
                                                 torch_dtype=torch.bfloat16,
                                                 )
    model.to(f"cuda:{local_rank}")

    with torch.no_grad():
        train_outputs = {"loss": [], "embeddings": [], "index": []}
        for batch in tqdm(train_dataloader, disable=local_rank != 0):
            batch["input_ids"] = batch["input_ids"].to(f"cuda:{local_rank}")
            batch["labels"] = batch["labels"].to(f"cuda:{local_rank}")
            outputs = model(input_ids=batch["input_ids"], labels=batch["labels"], output_hidden_states=True)
            loss = calc_cross_entropy_no_reduction(outputs.logits, batch["labels"])
            train_outputs["loss"].extend(loss)

            embeddings = outputs.hidden_states[-1]
            batch_size = batch["input_ids"].shape[0]
            sequence_lengths = []
            # since we use multiturn with multiple <|endoftext|>, we need to find the place where
            # <|endoftext|> is repeated
            for item in batch["input_ids"]:
                indices = torch.where(item == tokenizer.pad_token_id)[0]
                found = False
                for index in indices:
                    # case where sequence is less than max length
                    if torch.all(item[index:] == tokenizer.pad_token_id):
                        sequence_lengths.append(index)
                        found = True
                        break
                # case where sequence is >= max length
                if not found:
                    sequence_lengths.append(len(item) - 1)

            sequence_lengths = torch.tensor(sequence_lengths)
            # pool by taking the hidden state at the last real (non-pad) token of each sequence
            pooled_logits = embeddings[torch.arange(batch_size, device=embeddings.device), sequence_lengths]

            train_outputs["embeddings"].append(pooled_logits)
            train_outputs["index"].extend(batch["index"].to(model.device))

            torch.cuda.empty_cache()

        train_outputs = nested_numpify(train_outputs)
        # stack since they're 0-dim arrays
        train_outputs["index"] = np.stack(train_outputs["index"])
        train_outputs["loss"] = np.stack(train_outputs["loss"])
        train_outputs["embeddings"] = np.concatenate(train_outputs["embeddings"])

        df_train = Dataset.from_dict(train_outputs)
        curr_idx = df_train["index"]

        # compute mask in pyarrow since it's super fast
        # ty @bmschmidt for showing me this!
        table = train_dataset.data
        mask = pc.is_in(table['index'], value_set=pa.array(curr_idx, pa.int32()))
        filtered_table = table.filter(mask)
        # convert from pyarrow to Dataset
        filtered_train = Dataset.from_dict(filtered_table.to_pydict())

        filtered_train = filtered_train.add_column("embeddings", df_train["embeddings"])
        filtered_train = filtered_train.add_column("loss", df_train["loss"])
        filtered_train = filtered_train.add_column("is_train", [True] * len(filtered_train))

        filtered_train.to_json(f"inference/epoch_2_embeddings_train_shard_{local_rank}.jsonl", lines=True, orient="records", num_proc=64)

        val_outputs = {"loss": [], "embeddings": [], "index": []}
        for batch in tqdm(val_dataloader, disable=local_rank != 0):
            batch["input_ids"] = batch["input_ids"].to(f"cuda:{local_rank}")
            batch["labels"] = batch["labels"].to(f"cuda:{local_rank}")
            outputs = model(input_ids=batch["input_ids"], labels=batch["labels"], output_hidden_states=True)
            loss = calc_cross_entropy_no_reduction(outputs.logits, batch["labels"])
            val_outputs["loss"].extend(loss)

            embeddings = outputs.hidden_states[-1]
            batch_size = batch["input_ids"].shape[0]
            sequence_lengths = []
            # since we use multiturn with multiple <|endoftext|>, we need to find the place where
            # <|endoftext|> is repeated
            for item in batch["input_ids"]:
                indices = torch.where(item == tokenizer.pad_token_id)[0]
                found = False
                for index in indices:
                    # case where sequence is less than max length
                    if torch.all(item[index:] == tokenizer.pad_token_id):
                        sequence_lengths.append(index)
                        found = True
                        break
                # case where sequence is >= max length
                if not found:
                    sequence_lengths.append(len(item) - 1)

            sequence_lengths = torch.tensor(sequence_lengths)
            # pool by taking the hidden state at the last real (non-pad) token of each sequence
            pooled_logits = embeddings[torch.arange(batch_size, device=embeddings.device), sequence_lengths]

            val_outputs["embeddings"].append(pooled_logits)
            val_outputs["index"].extend(batch["index"].to(model.device))

            torch.cuda.empty_cache()

        val_outputs = nested_numpify(val_outputs)
        val_outputs["index"] = np.stack(val_outputs["index"])
        val_outputs["loss"] = np.stack(val_outputs["loss"])
        val_outputs["embeddings"] = np.concatenate(val_outputs["embeddings"])

        df_val = Dataset.from_dict(val_outputs)
        curr_idx = df_val["index"]

        # compute mask in pyarrow since it's super fast
        # ty @bmschmidt for showing me this!
        table = val_dataset.data
        mask = pc.is_in(table['index'], value_set=pa.array(curr_idx, pa.int32()))
        filtered_table = table.filter(mask)
        # convert from pyarrow to Dataset
        filtered_val = Dataset.from_dict(filtered_table.to_pydict())
        filtered_val = filtered_val.add_column("embeddings", df_val["embeddings"])
        filtered_val = filtered_val.add_column("loss", df_val["loss"])
        filtered_val = filtered_val.add_column("is_train", [False] * len(filtered_val))

        filtered_val.to_json(f"inference/epoch_2_embeddings_val_shard_{local_rank}.jsonl", lines=True, orient="records", num_proc=64)


def main():
    dist.init_process_group("nccl")
    parser = ArgumentParser()
    parser.add_argument("--config", type=str, default="config.yaml")

    args = parser.parse_args()
    config = read_config(args.config)

    inference(config)


if __name__ == "__main__":
    # parse arguments by reading in a config
    main()
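A quick way to sanity-check the per-example loss used above is to run the same shift-and-permute on toy tensors; this sketch (shapes arbitrary) shows it returns one loss per sequence rather than a scalar:

```python
# Toy check of calc_cross_entropy_no_reduction's shift/permute logic.
import torch
import torch.nn as nn

lm_logits = torch.randn(2, 5, 10)        # (batch, seq_len, vocab_size)
labels = torch.randint(0, 10, (2, 5))    # (batch, seq_len)

shift_logits = lm_logits[..., :-1, :].contiguous()  # predict token t+1 from position t
shift_labels = labels[..., 1:].contiguous()
loss_fct = nn.CrossEntropyLoss(reduction='none')
# permute to (batch, vocab, seq) since CrossEntropyLoss expects the class dim second
loss = loss_fct(shift_logits.permute(0, 2, 1), shift_labels).mean(dim=1)
print(loss.shape)  # torch.Size([2]): one scalar loss per example
```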
@ -1 +0,0 @@
Subproject commit cae78c46d658a8e496a815c2ee49b9b178fb9c9a