gpt4all/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py

#!/usr/bin/env python3
# Convert Hugging Face MPT models to GGUF format
#
# Usage:
#
#   python3 convert_mpt_hf_to_gguf.py model-name dir-output [ftype]
#
# This script is similar to "convert-pt-to-ggml.py"
#

from __future__ import annotations

import json
import struct
import sys
from pathlib import Path

import gguf
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, MptConfig
from transformers.models.gpt2 import tokenization_gpt2


# Command-line handling.
#
# Expected invocation: <script> model-name dir-output [ftype]
# Any problem with the arguments prints a message and exits with status 1
# before anything is created on disk.
if not 3 <= len(sys.argv) < 5:
    print("Usage: {} model-name dir-output [ftype]".format(Path(__file__).name))
    print("  model-name: name of the model to convert. Example: 'mosaicml/mpt-7b'")
    print("  dir-output: directory where the output file will be written")
    print("  ftype == 0 -> float32")
    print("  ftype == 1 -> float16")
    sys.exit(1)

dir_model = Path(sys.argv[1])
dir_out = Path(sys.argv[2])

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

# float16 is the default when no ftype argument is given
ftype = 1
if len(sys.argv) > 3:
    try:
        ftype = int(sys.argv[3])
    except ValueError:
        # a non-numeric ftype gets the same diagnostic as an out-of-range one
        print("Invalid ftype: " + sys.argv[3])
        sys.exit(1)
    if not 0 <= ftype < len(ftype_str):
        print("Invalid ftype: " + str(ftype))
        sys.exit(1)

# make sure the output directory exists (including any missing parents);
# done only after the arguments have been validated
dir_out.mkdir(parents=True, exist_ok=True)

fname_out = dir_out / f"ggml-model-{dir_model.name}-{ftype_str[ftype]}.gguf"


# Create the GGUF writer for the MPT architecture and record the model
# hyperparameters read from the Hugging Face config.
ARCH = gguf.MODEL_ARCH.MPT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)

block_count = config.n_layers

# The feed-forward width is expansion_ratio * d_model. Read the ratio from the
# config instead of assuming 4; fall back to 4 when the config does not
# define it, which reproduces the previous hard-coded behaviour.
expansion_ratio = getattr(config, 'expansion_ratio', 4)

gguf_writer.add_name("MPT")
gguf_writer.add_context_length(config.max_seq_len)
gguf_writer.add_embedding_length(config.d_model)
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(int(expansion_ratio * config.d_model))
gguf_writer.add_head_count(config.n_heads)
# 'kv_n_heads' is optional in attn_config; only written when present and non-zero
if kv_n_heads := config.attn_config.get('kv_n_heads'):
    gguf_writer.add_head_count_kv(kv_n_heads)
gguf_writer.add_max_alibi_bias(config.attn_config['alibi_bias_max'])
gguf_writer.add_layer_norm_eps(MptConfig().layer_norm_epsilon)  # use default from upstream transformers
gguf_writer.add_file_type(ftype)

# QKV clamping is only recorded when the config enables it
clip_qkv = config.attn_config['clip_qkv']
if clip_qkv is not None:
    gguf_writer.add_clamp_kqv(clip_qkv)

# Export the tokenizer vocabulary: one byte string and one token type per id
# in range(config.vocab_size), followed by the special-token / merges data.
print("gguf: get gpt2 tokenizer vocab")

tokenizer = AutoTokenizer.from_pretrained(dir_model)

# Sets keep the membership tests inside the per-token loop O(1); the original
# list / dict-values view would be scanned once per vocabulary entry.
special_ids = set(tokenizer.all_special_ids)
added_tokens = set(tokenizer.get_added_vocab().values())

# token id -> token string as stored in the tokenizer vocabulary
reverse_vocab = {tok_id: encoded_tok for encoded_tok, tok_id in tokenizer.vocab.items()}

# inverse of the GPT-2 byte-to-unicode table: printable character -> raw byte
byte_encoder = tokenization_gpt2.bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

tokens: list[bytearray] = []
toktypes: list[gguf.TokenType] = []

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
for i in range(config.vocab_size):
    if i not in reverse_vocab:
        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
        pad_token = f"[PAD{i}]".encode("utf8")
        text = bytearray(pad_token)
    elif i in added_tokens:
        # these tokens are not encoded, for some reason
        text = bytearray(reverse_vocab[i].encode('utf-8'))
    else:
        # regular tokens are stored byte-encoded; map each character back to its byte
        text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])

    tokens.append(text)

    # TODO(cebtenzzre): is there a better way to do this?
    toktypes.append(gguf.TokenType.CONTROL if i in special_ids else gguf.TokenType.NORMAL)

gguf_writer.add_tokenizer_model("gpt2")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)

# Load the model weights and register every tensor of the state dict with the
# GGUF writer under its mapped name.
print("gguf: get tensor metadata")

print("Loading model:", dir_model)
model = AutoModelForCausalLM.from_pretrained(
    dir_model, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32,
    low_cpu_mem_usage=True, trust_remote_code=True,
)
print("Model loaded:", dir_model)

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

list_vars = model.state_dict()
for name, tensor in list_vars.items():
    data = tensor.squeeze().numpy()
    print("Processing variable:", name, "with shape:", data.shape)

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    # Only 2-D ".weight" tensors are stored as float16; everything else,
    # including the token embeddings (".wte"), is kept in fp32.
    if ftype == 1 and name.endswith(".weight") and n_dims == 2 and ".wte" not in name:
        print("  Converting to float16")
        data = data.astype(np.float16)
    elif ftype == 1 or data.dtype != np.float32:
        print("  Converting to float32")
        data = data.astype(np.float32)

    # map tensor names
    new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
    if new_name is None:
        print("Can not map tensor '" + name + "'")
        # an unmappable tensor is a failed conversion: exit with a non-zero status
        sys.exit(1)

    gguf_writer.add_tensor(new_name, data)


# Run the three writer steps in order, announcing each one before it starts,
# then close the writer and report where the file was written.
for announcement, write_step in (
    ("gguf: write header", gguf_writer.write_header_to_file),
    ("gguf: write metadata", gguf_writer.write_kv_data_to_file),
    ("gguf: write tensors", gguf_writer.write_tensors_to_file),
):
    print(announcement)
    write_step()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'", end="\n\n")