convert scripts: use bytes_to_unicode from transformers

2024-11-02 09:40:42 +00:00 · 2023-09-29 17:39:49 -04:00 · 2023-09-29 17:39:49 -04:00 · 0493e6eb07
commit 0493e6eb07
parent a49a1dcdf4
2 changed files with 5 additions and 48 deletions
--- a/gpt4all-backend/scripts/convert_gptj_to_gguf.py
+++ b/gpt4all-backend/scripts/convert_gptj_to_gguf.py
@@ -27,28 +27,7 @@ from pathlib import Path
 import gguf
 import numpy as np
 from transformers import AutoTokenizer, GPTJConfig, GPTJForCausalLM
-
+from transformers.models.gpt2 import tokenization_gpt2
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    return dict(zip(bs, (chr(n) for n in cs)))
 if not 2 <= len(sys.argv) < 4:
@@ -100,7 +79,7 @@ print("gguf: get gpt2 tokenizer vocab")
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
+byte_encoder = tokenization_gpt2.bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
 tokens: list[bytearray] = []
--- a/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py
+++ b/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py
@@ -18,30 +18,8 @@ from pathlib import Path
 import gguf
 import numpy as np
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BloomForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
-
+from transformers.models.gpt2 import tokenization_gpt2
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
 if not 3 <= len(sys.argv) < 5:
@@ -104,7 +82,7 @@ special_ids = tokenizer.all_special_ids
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 added_tokens = tokenizer.get_added_vocab().values()
-byte_encoder = bytes_to_unicode()
+byte_encoder = tokenization_gpt2.bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}
 tokens: list[bytearray] = []