diff --git a/gpt4all-backend/mpt.cpp b/gpt4all-backend/mpt.cpp
index f493dd5f..d316c03b 100644
--- a/gpt4all-backend/mpt.cpp
+++ b/gpt4all-backend/mpt.cpp
@@ -1,6 +1,8 @@
 #include "mpt.h"
 #include "llama.cpp/ggml.h"
 
+#include "utils.h"
+
 #include <cassert>
 #include <cmath>
 #include <cstdio>
@@ -136,26 +138,8 @@ static bool kv_cache_init(
     return true;
 }
 
-struct mpt_vocab {
-    using id = int32_t;
-    using token = std::string;
-
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-    std::vector<token> special_tokens;
-
-    void add_special_token(const std::string &token) {
-        special_tokens.push_back(token);
-    }
-};
-
-std::string regex_escape(const std::string &s) {
-    static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
-    return std::regex_replace(s, metacharacters, "\\$&");
-}
-
 // load the model's weights from a stream
-bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & model, mpt_vocab & vocab) {
+bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & model, gpt_vocab & vocab) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     // verify magic
@@ -219,8 +203,6 @@ bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & mod
             vocab.id_to_token[i] = word;
         }
 
-        // TODO: this only kind-of works, the gpt_tokenize can still incorrectly
-        // tokenize special tokens
         if(special) {
             vocab.add_special_token(word);
         }
@@ -436,7 +418,7 @@ bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & mod
 }
 
 // load the model's weights from a file path
-bool mpt_model_load(const std::string & fname, mpt_model & model, mpt_vocab & vocab) {
+bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
 
     auto fin = std::ifstream(fname, std::ios::binary);
     if (!fin) {
@@ -647,98 +629,6 @@ bool mpt_eval(
     return true;
 }
 
-std::vector<mpt_vocab::id> mpt_tokenize_inner(const mpt_vocab & vocab, const std::string & text) {
-    // taken from stablelm example in ggml
-    // they both use the gpt-neox tokenizer
-    // not sure if this entirely right?
-    std::vector<std::string> words;
-
-
-    // first split the text into words
-    {
-        std::string str = text;
-        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-        std::regex re(pat);
-        std::smatch m;
-
-        while (std::regex_search(str, m, re)) {
-            for (auto x : m) {
-                words.push_back(x);
-            }
-            str = m.suffix();
-        }
-    }
-
-    // find the longest tokens that form the words:
-    std::vector<mpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        if (word.size() == 0) continue;
-
-        int i = 0;
-        int n = word.size();
-        while (i < n) {
-            int j = n;
-            while (j > i) {
-                auto it = vocab.token_to_id.find(word.substr(i, j-i));
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i = j;
-                    break;
-                }
-                --j;
-            }
-            if (i == n) {
-                break;
-            }
-            if (j == i) {
-                auto sub = word.substr(i, 1);
-                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
-                    tokens.push_back(vocab.token_to_id.at(sub));
-                } else {
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
-                }
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
-std::vector<mpt_vocab::id> mpt_tokenize(const mpt_vocab & vocab, const std::string & text) {
-    // Generate the subpattern from the special_tokens vector if it's not empty
-    if (!vocab.special_tokens.empty()) {
-        std::vector<mpt_vocab::id> out;
-        std::vector<std::string> chunks;
-        std::string str = text;
-        std::string special_tokens_subpattern;
-        for (const auto &token : vocab.special_tokens) {
-            if (!special_tokens_subpattern.empty()) {
-                special_tokens_subpattern += "|";
-            }
-            special_tokens_subpattern += regex_escape(token);
-        }
-        std::regex re(special_tokens_subpattern);
-        std::smatch m;
-        while (std::regex_search(str, m, re)) {
-            auto tok = vocab.token_to_id.find(m.str());
-            if (tok != vocab.token_to_id.end()) {
-                auto tokid = tok->second;
-                auto pfxtoks = mpt_tokenize_inner(vocab, m.prefix());
-                out.insert(out.end(), pfxtoks.begin(), pfxtoks.end());
-                out.push_back(tokid);
-                str = m.suffix();
-            }
-        }
-        if (!str.empty()) {
-            auto tokrest = mpt_tokenize_inner(vocab, str);
-            out.insert(out.end(), tokrest.begin(), tokrest.end());
-        }
-        return out;
-    } else {
-        return mpt_tokenize_inner(vocab, text);
-    }
-}
 
 #define MPT_MAX_RNG_STATE 64*1024
 
@@ -801,8 +691,8 @@ size_t mpt_copy_state_data(const mpt_model &model, const std::mt19937 &rng, uint
     return written;
 }
 
-mpt_vocab::id mpt_sample_top_k_top_p(
-    const mpt_vocab & vocab,
+gpt_vocab::id mpt_sample_top_k_top_p(
+    const gpt_vocab & vocab,
     const size_t actualVocabSize,
     const int32_t * last_n_tokens_data,
     int last_n_tokens_size,
@@ -817,7 +707,7 @@ mpt_vocab::id mpt_sample_top_k_top_p(
     const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
     const auto * plogits = logits.data() + logits.size() - n_logits;
 
-    std::vector<std::pair<double, mpt_vocab::id>> logits_id;
+    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
     logits_id.reserve(n_logits);
 
     {
@@ -842,7 +732,7 @@
     std::partial_sort(
             logits_id.begin(),
             logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, mpt_vocab::id> & a, const std::pair<double, mpt_vocab::id> & b) {
+            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
         return a.first > b.first;
     });
 
@@ -952,7 +842,7 @@ size_t mpt_set_state_data(mpt_model *model, std::mt19937 *rng, const uint8_t *sr
 struct MPTPrivate {
     const std::string modelPath;
     bool modelLoaded;
-    mpt_vocab vocab;
+    gpt_vocab vocab;
     mpt_model *model = nullptr;
     int64_t n_threads = 0;
     size_t mem_per_token = 0;
@@ -1037,7 +927,7 @@ void MPT::prompt(const std::string &prompt,
     int64_t t_prompt_us = 0;
 
     // tokenize the prompt
-    std::vector<mpt_vocab::id> embd_inp = mpt_tokenize(d_ptr->vocab, prompt);
+    std::vector<gpt_vocab::id> embd_inp = gpt_tokenize(d_ptr->vocab, prompt);
 
     // save the context size
     promptCtx.n_ctx = d_ptr->model->hparams.n_ctx;
diff --git a/gpt4all-backend/utils.cpp b/gpt4all-backend/utils.cpp
index b9b653f5..b1cb113b 100644
--- a/gpt4all-backend/utils.cpp
+++ b/gpt4all-backend/utils.cpp
@@ -102,7 +102,7 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
     return result;
 }
 
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
+std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text) {
     std::vector<std::string> words;
 
     // first split the text into words
@@ -157,6 +157,47 @@ std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
 
+std::string regex_escape(const std::string &s) {
+    static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
+    return std::regex_replace(s, metacharacters, "\\$&");
+}
+
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
+    // Generate the subpattern from the special_tokens vector if it's not empty
+    if (!vocab.special_tokens.empty()) {
+        std::vector<gpt_vocab::id> out;
+        std::vector<std::string> chunks;
+        std::string str = text;
+        std::string special_tokens_subpattern;
+        for (const auto &token : vocab.special_tokens) {
+            if (!special_tokens_subpattern.empty()) {
+                special_tokens_subpattern += "|";
+            }
+            special_tokens_subpattern += regex_escape(token);
+        }
+        std::regex re(special_tokens_subpattern);
+        std::smatch m;
+        while (std::regex_search(str, m, re)) {
+            auto tok = vocab.token_to_id.find(m.str());
+            if (tok != vocab.token_to_id.end()) {
+                auto tokid = tok->second;
+                auto pfxtoks = gpt_tokenize_inner(vocab, m.prefix());
+                out.insert(out.end(), pfxtoks.begin(), pfxtoks.end());
+                out.push_back(tokid);
+                str = m.suffix();
+            }
+        }
+        if (!str.empty()) {
+            auto tokrest = gpt_tokenize_inner(vocab, str);
+            out.insert(out.end(), tokrest.begin(), tokrest.end());
+        }
+        return out;
+    } else {
+        return gpt_tokenize_inner(vocab, text);
+    }
+}
+
+
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
     printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
 
diff --git a/gpt4all-backend/utils.h b/gpt4all-backend/utils.h
index 90cfdd97..e51b66b0 100644
--- a/gpt4all-backend/utils.h
+++ b/gpt4all-backend/utils.h
@@ -44,6 +44,11 @@ struct gpt_vocab {
 
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::vector<token> special_tokens;
+
+    void add_special_token(const std::string &token) {
+        special_tokens.push_back(token);
+    }
 };
 
 void replace(std::string & str, const std::string & needle, const std::string & replacement);
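
Note on the unified tokenizer: gpt_tokenize now works in two phases. It first builds one alternation regex out of all registered special tokens (each escaped via regex_escape so that characters like | match literally), then walks the input with regex_search, emitting each special token as a single vocabulary id and handing only the plain text between matches to the greedy longest-match gpt_tokenize_inner. The sketch below isolates that splitting phase; the special tokens and input string are hypothetical examples, not values taken from this patch.

    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    // Escape regex metacharacters so a special token matches literally
    // (same pattern as the regex_escape moved into utils.cpp).
    static std::string regex_escape(const std::string &s) {
        static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
        return std::regex_replace(s, metacharacters, "\\$&");
    }

    int main() {
        // Hypothetical special tokens, e.g. ChatML-style markers.
        std::vector<std::string> special = {"<|im_start|>", "<|im_end|>"};

        // Build the alternation subpattern the same way gpt_tokenize does.
        std::string pat;
        for (const auto &tok : special) {
            if (!pat.empty()) pat += "|";
            pat += regex_escape(tok);
        }
        std::regex re(pat);

        // Split on special tokens; in the real code each "text" piece goes to
        // gpt_tokenize_inner and each "special" piece becomes one token id.
        std::string str = "<|im_start|>user hello<|im_end|>";
        std::smatch m;
        while (std::regex_search(str, m, re)) {
            if (!m.prefix().str().empty())
                std::cout << "text:    '" << m.prefix().str() << "'\n";
            std::cout << "special: '" << m.str() << "'\n";
            str = m.suffix();
        }
        if (!str.empty())
            std::cout << "text:    '" << str << "'\n";
    }

Run as written, this prints the <|im_start|> marker, the "user hello" span, and the <|im_end|> marker in order, which is the same partition the new gpt_tokenize feeds to gpt_tokenize_inner. One caveat visible in the patched loop itself: a matched special token that is missing from token_to_id fails the tok != vocab.token_to_id.end() check without advancing str, so the code relies on every registered special token also being present in the vocabulary.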