From cca9e6ce81540f1ae300a7fe92bfd29deff0b830 Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Fri, 29 Sep 2023 10:02:04 -0400
Subject: [PATCH] convert_mpt_hf_to_gguf.py: better tokenizer decoding

---
 .../scripts/convert_mpt_hf_to_gguf.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py b/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py
index 591a2921..1b5d1367 100644
--- a/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py
+++ b/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py
@@ -101,17 +101,27 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 special_ids = tokenizer.all_special_ids
 
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+added_tokens = tokenizer.get_added_vocab().values()
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
+
 tokens: list[bytearray] = []
 toktypes: list[gguf.TokenType] = []
 
-# TODO(cebtenzzre): this is probably wrong, but I don't know what else to put here
-dot_token = tokenizer.encode('.')[0]
-
 # The number of tokens in tokenizer.json can differ from the expected vocab size.
 # This causes downstream issues with mismatched tensor sizes when running the inference
 for i in range(config.vocab_size):
-    text = tokenizer.decode([dot_token, i]).encode('utf-8')
-    text = text[1:] # remove the first byte (it's always '.')
+    if i not in reverse_vocab:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+    elif i in added_tokens:
+        # these tokens are not encoded, for some reason
+        text = bytearray(reverse_vocab[i].encode('utf-8'))
+    else:
+        text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+
     tokens.append(text)
 
     # TODO(cebtenzzre): is there a better way to do this?
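
Note (not part of the patch): the new decoding path relies on the GPT-2 byte-to-unicode trick, where every raw byte is stored in tokenizer.json as a printable character and byte_decoder maps it back to the original byte. Below is a minimal, self-contained sketch of that round trip. bytes_to_unicode is reproduced on the assumption that it matches the helper the script already imports (the standard GPT-2 tokenizer version), and the sample token 'Ġhello' is only an illustration.

def bytes_to_unicode() -> dict[int, str]:
    # Map each of the 256 byte values to a printable unicode character so that
    # byte-level BPE tokens can be stored as plain strings in tokenizer.json.
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)  # shift non-printable bytes into an unused unicode range
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

# 'Ġhello' is how a byte-level BPE vocab spells ' hello' (byte 0x20 maps to 'Ġ').
encoded_tok = "Ġhello"
raw = bytes(byte_decoder[c] for c in encoded_tok)
print(raw)  # b' hello'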