From 0493e6eb07083d15ed698d9bce9501218348f2ad Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Fri, 29 Sep 2023 17:39:49 -0400
Subject: [PATCH] convert scripts: use bytes_to_unicode from transformers

---
 .../scripts/convert_gptj_to_gguf.py   | 25 ++---------------
 .../scripts/convert_mpt_hf_to_gguf.py | 28 ++-----------------
 2 files changed, 5 insertions(+), 48 deletions(-)

diff --git a/gpt4all-backend/scripts/convert_gptj_to_gguf.py b/gpt4all-backend/scripts/convert_gptj_to_gguf.py
index 0983aa4b..0e3516b4 100644
--- a/gpt4all-backend/scripts/convert_gptj_to_gguf.py
+++ b/gpt4all-backend/scripts/convert_gptj_to_gguf.py
@@ -27,28 +27,7 @@ from pathlib import Path
 import gguf
 import numpy as np
 from transformers import AutoTokenizer, GPTJConfig, GPTJForCausalLM
-
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
+from transformers.models.gpt2 import tokenization_gpt2
 
 
 if not 2 <= len(sys.argv) < 4:
@@ -100,7 +79,7 @@ print("gguf: get gpt2 tokenizer vocab")
 
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
+byte_encoder = tokenization_gpt2.bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 tokens: list[bytearray] = []
diff --git a/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py b/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py
index a49ceb36..d859348c 100755
--- a/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py
+++ b/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py
@@ -18,30 +18,8 @@ from pathlib import Path
 import gguf
 import numpy as np
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BloomForCausalLM
-
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
+from transformers.models.gpt2 import tokenization_gpt2
 
 
 if not 3 <= len(sys.argv) < 5:
@@ -104,7 +82,7 @@ special_ids = tokenizer.all_special_ids
 
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 added_tokens = tokenizer.get_added_vocab().values()
-byte_encoder = bytes_to_unicode()
+byte_encoder = tokenization_gpt2.bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
 
 tokens: list[bytearray] = []
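Note (not part of the patch): the change is only a no-op if transformers' helper builds exactly the table the deleted local copies built. Below is a minimal sanity-check sketch, assuming a transformers install that provides tokenization_gpt2.bytes_to_unicode (which the patch itself relies on); local_bytes_to_unicode is a hypothetical name that simply restates the removed helper for comparison.

# Illustrative check only: the function now imported from transformers should
# produce the same byte <-> unicode mapping as the removed local helper.
from transformers.models.gpt2 import tokenization_gpt2


def local_bytes_to_unicode():
    # Restatement of the deleted helper: map every byte 0..255 to a printable
    # unicode character, keeping the printable latin-1 ranges as themselves.
    bs = list(range(ord("!"), ord("~")+1)) + list(range(ord("¡"), ord("¬")+1)) + list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))


byte_encoder = tokenization_gpt2.bytes_to_unicode()
assert byte_encoder == local_bytes_to_unicode()  # identical 256-entry table
assert len(byte_encoder) == 256

# The convert scripts only use the inverse, to turn vocab strings back into raw bytes.
byte_decoder = {v: k for k, v in byte_encoder.items()}
assert all(byte_decoder[byte_encoder[b]] == b for b in range(256))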