convert scripts: load model as late as possible

gguf_latest_llama
Authored by Cebtenzzre 11 months ago; committed by Adam Treat
parent fd47088f2b
commit 25297786db
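
All three convert scripts move to the same pattern: read only the Hugging Face config up front, write the GGUF hyperparameter metadata from it, and defer the expensive model load until the tensor data is actually needed. A minimal sketch of that pattern, assuming a local checkpoint directory and abbreviated writer calls (dir_model, fname_out, and the BERT-style hyperparameter names are placeholders taken from the first hunk below):

# Sketch of the deferred-loading pattern; dir_model and fname_out are placeholders.
import gguf
from transformers import AutoConfig, AutoModel

dir_model = "path/to/hf-checkpoint"   # hypothetical local checkpoint directory
fname_out = "model.gguf"              # hypothetical output path

ARCH = gguf.MODEL_ARCH.BERT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

# 1) Config only: enough for all hyperparameter metadata, no weights in memory yet.
config = AutoConfig.from_pretrained(dir_model)
gguf_writer.add_context_length(config.max_position_embeddings)
gguf_writer.add_embedding_length(config.hidden_size)
gguf_writer.add_block_count(config.num_hidden_layers)

# 2) Tokenizer metadata would be written here, still without loading the model.

# 3) Only now load the weights, reusing the config that was already read.
model = AutoModel.from_pretrained(dir_model, config=config, low_cpu_mem_usage=True)
for name, data in model.state_dict().items():
    pass  # map `name` via gguf.get_tensor_name_map(ARCH, ...) and add the tensor data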

@@ -7,7 +7,7 @@ from pathlib import Path
 
 import gguf
 import numpy as np
-from transformers import AutoModel, AutoTokenizer
+from transformers import AutoConfig, AutoModel, AutoTokenizer
 
 if not 2 <= len(sys.argv) < 4:
@@ -44,17 +44,15 @@ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 
 print("gguf: get model metadata")
 
-model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True)
-hparams = model.config
-print(model)
+config = AutoConfig.from_pretrained(dir_model)
 
-block_count = hparams.num_hidden_layers
+block_count = config.num_hidden_layers
 gguf_writer.add_name("BERT")
-gguf_writer.add_context_length(hparams.max_position_embeddings)
-gguf_writer.add_embedding_length(hparams.hidden_size)
-gguf_writer.add_feed_forward_length(hparams.intermediate_size)
+gguf_writer.add_context_length(config.max_position_embeddings)
+gguf_writer.add_embedding_length(config.hidden_size)
+gguf_writer.add_feed_forward_length(config.intermediate_size)
 gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(hparams.num_attention_heads)
+gguf_writer.add_head_count(config.num_attention_heads)
 gguf_writer.add_file_type(ftype)
 
 print("gguf: get tokenizer metadata")
@@ -76,7 +74,7 @@ reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
-for i in range(hparams.vocab_size):
+for i in range(config.vocab_size):
     try:
         text = reverse_vocab[i]
     except KeyError:
@@ -94,6 +92,9 @@ special_vocab.add_to_gguf(gguf_writer)
 
 print("gguf: get tensor metadata")
 
+model = AutoModel.from_pretrained(dir_model, config=config, low_cpu_mem_usage=True)
+print(model)
+
 tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
 list_vars = model.state_dict()

@@ -80,12 +80,6 @@ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 
 print("gguf: get model metadata")
 
 config = AutoConfig.from_pretrained(model_name)
-print("Loading model:", model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, low_cpu_mem_usage=True,
-)
-config = model.config
-print("Model loaded:", model_name)
 
 block_count = config.n_layers
 gguf_writer.add_name("MPT")
@@ -129,6 +123,12 @@ gguf_writer.add_token_types(toktypes)
 
 print("gguf: get tensor metadata")
 
+print("Loading model:", model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, low_cpu_mem_usage=True,
+)
+print("Model loaded:", model_name)
+
 tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
 list_vars = model.state_dict()

@@ -8,7 +8,7 @@ from pathlib import Path
 
 import gguf
 import numpy as np
 from sentencepiece import SentencePieceProcessor
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
 if not 2 <= len(sys.argv) < 4:
@@ -42,9 +42,7 @@ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
 
 print("gguf: get model metadata")
 
-model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
-config = model.config
-#print(model)
+config = AutoConfig.from_pretrained(dir_model)
 
 block_count = config.n_layers
 gguf_writer.add_name("Replit")
@@ -95,6 +93,9 @@ special_vocab.add_to_gguf(gguf_writer)
 
 print("gguf: get tensor metadata")
 
+model = AutoModelForCausalLM.from_pretrained(dir_model, config=config, low_cpu_mem_usage=True)
+#print(model)
+
 tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
 list_vars = model.state_dict()
