feat: build works + tokenizer

Zach Nussbaum 2023-05-06 15:35:02 -04:00 committed by Adam Treat
parent 525b703984
commit 2f6ecbe798


@@ -15,11 +15,11 @@
 #include <sstream>
 #include <thread>
 #include <unordered_set>
+#include <regex>

 static const size_t MB = 1024*1024;

 struct mpt_hparams {
-    // FIXME: for mpt
     int32_t n_vocab = 50432;
     int32_t n_ctx   = 2048;
     int32_t n_embd  = 4096;
@@ -150,6 +150,9 @@ struct mpt_vocab {
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::vector<std::string> special_tokens;
+
+    void add_special_token(const std::string &token);
 };

 // load the model's weights from a stream
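The definition of add_special_token is not visible in this hunk. Given the declaration and the new special_tokens member, a minimal definition consistent with it would presumably look like the sketch below (an assumption, not the literal committed code):

    // Assumed definition -- not shown in this hunk; sketch based on the
    // declaration and the special_tokens member added above.
    void mpt_vocab::add_special_token(const std::string & token) {
        special_tokens.push_back(token);
    }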
@@ -316,6 +319,9 @@ bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & mod
         layer.norm_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
         layer.norm_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+        layer.norm_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+        layer.norm_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
         layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
         layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
         layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
@@ -338,11 +344,14 @@ bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & mod
         model.tensors["transformer.block." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w;

-        model.tensors["transformer.block." + std::to_string(i) + ".mlp.fc_in.weight"]    = layer.c_mlp_fc_w;
-        model.tensors["transformer.block." + std::to_string(i) + ".mlp.fc_in.bias"]      = layer.c_mlp_fc_b;
-        model.tensors["transformer.block." + std::to_string(i) + ".mlp.fc_out.weight"]   = layer.c_mlp_proj_w;
-        model.tensors["transformer.block." + std::to_string(i) + ".mlp.fc_out.bias"]     = layer.c_mlp_proj_b;
+        model.tensors["transformer.block." + std::to_string(i) + ".mlp.up_proj.weight"]   = layer.up_proj_w;
+        model.tensors["transformer.block." + std::to_string(i) + ".mlp.up_proj.bias"]     = layer.up_proj_b;
+        model.tensors["transformer.block." + std::to_string(i) + ".mlp.down_proj.weight"] = layer.down_proj_w;
+        model.tensors["transformer.block." + std::to_string(i) + ".mlp.down_proj.bias"]   = layer.down_proj_b;
+
+        model.tensors["transformer.block." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_g;
+        model.tensors["transformer.block." + std::to_string(i) + ".norm_2.bias"]   = layer.norm_2_b;
     }

     // key + value memory
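For the renamed tensor keys above to compile, the mpt_layer struct presumably gains matching members elsewhere in this commit (up_proj_w/b and down_proj_w/b replacing c_mlp_fc_*/c_mlp_proj_*, plus norm_2_g/b). A rough sketch of that struct, inferred from the member names used in this hunk and assuming ggml.h is included; the actual struct change is not shown here:

    // Sketch only: inferred from the tensor names used above, not the
    // committed struct definition.
    struct mpt_layer {
        // pre-attention / pre-MLP layer norms
        struct ggml_tensor * norm_1_g, * norm_1_b;
        struct ggml_tensor * norm_2_g, * norm_2_b;

        // attention projections (unchanged by this hunk)
        struct ggml_tensor * c_attn_q_proj_w, * c_attn_k_proj_w, * c_attn_v_proj_w, * c_attn_proj_w;

        // MLP, renamed to match the checkpoint keys mlp.up_proj / mlp.down_proj
        struct ggml_tensor * up_proj_w,   * up_proj_b;
        struct ggml_tensor * down_proj_w, * down_proj_b;
    };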
@@ -552,7 +561,6 @@ bool mpt_eval(
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
         }

-        // we need to replace rope with alibi
         // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
         struct ggml_tensor * Q =
@@ -710,15 +718,77 @@ bool mpt_eval(
 }

 std::vector<int> mpt_tokenize(const mpt_vocab & vocab, const std::string & text) {
-    // FIXME
-    return std::vector<int>();
-}
-
-const std::string mpt_token_to_str(const mpt_vocab & vocab, int token) {
-    // FIXME
-    return std::string();
-}
+    // taken from stablelm example in ggml
+    // they both use the gpt-neox tokenizer
+    // not sure if this entirely right?
+    std::vector<std::string> words;
+
+    // first split the text into words
+    {
+        std::string str = text;
+        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
+
+        // Generate the subpattern from the special_tokens vector if it's not empty
+        if (!vocab.special_tokens.empty()) {
+            std::string special_tokens_subpattern;
+            for (const auto &token : vocab.special_tokens) {
+                if (!special_tokens_subpattern.empty()) {
+                    special_tokens_subpattern += "|";
+                }
+                special_tokens_subpattern += token;
+            }
+
+            // Modify the regex pattern with the generated special tokens subpattern
+            pat = special_tokens_subpattern + "|" + pat;
+        }
+
+        std::regex re(pat);
+        std::smatch m;
+
+        while (std::regex_search(str, m, re)) {
+            for (auto x : m) {
+                words.push_back(x);
+            }
+            str = m.suffix();
+        }
+    }
+
+    // find the longest tokens that form the words:
+    std::vector<mpt_vocab::id> tokens;
+    for (const auto & word : words) {
+        if (word.size() == 0) continue;
+
+        int i = 0;
+        int n = word.size();
+        while (i < n) {
+            int j = n;
+            while (j > i) {
+                auto it = vocab.token_to_id.find(word.substr(i, j-i));
+                if (it != vocab.token_to_id.end()) {
+                    tokens.push_back(it->second);
+                    i = j;
+                    break;
+                }
+                --j;
+            }
+            if (i == n) {
+                break;
+            }
+            if (j == i) {
+                auto sub = word.substr(i, 1);
+                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
+                    tokens.push_back(vocab.token_to_id.at(sub));
+                } else {
+                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
+                }
+                ++i;
+            }
+        }
+    }
+
+    return tokens;
+}
 #define MPT_MAX_RNG_STATE 64*1024
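For context: mpt_tokenize first splits the text into GPT-2-style chunks with the regex above, then encodes each chunk by greedily taking the longest vocabulary match at every position, falling back to single characters. A minimal standalone sketch of that greedy longest-match step, with a made-up toy vocabulary (real ids come from the model file):

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    // Toy vocabulary for illustration only.
    static const std::map<std::string, int> toy_vocab = {
        {"hel", 1}, {"hello", 2}, {"lo", 3}, {" world", 4}, {"l", 5}, {"o", 6}
    };

    // Greedy longest-match encoding of a single pre-split "word".
    static std::vector<int> greedy_tokenize(const std::string & word) {
        std::vector<int> tokens;
        size_t i = 0;
        while (i < word.size()) {
            bool matched = false;
            // try the longest substring starting at i first, then shorter ones
            for (size_t j = word.size(); j > i; --j) {
                auto it = toy_vocab.find(word.substr(i, j - i));
                if (it != toy_vocab.end()) {
                    tokens.push_back(it->second);
                    i = j;
                    matched = true;
                    break;
                }
            }
            if (!matched) {
                ++i; // nothing in the vocab matches: skip one character
            }
        }
        return tokens;
    }

    int main() {
        for (int id : greedy_tokenize("hello world")) {
            printf("%d ", id); // prints: 2 4
        }
        printf("\n");
    }

The sketch keeps the no-match case behind an explicit matched flag; in the committed loop, j == i also holds right after a successful partial match (i was just set to j), so the single-character fallback branch can fire immediately after a match, which may be what the "not sure if this entirely right?" comment is about. Special tokens are also spliced into the regex unescaped, so tokens containing regex metacharacters would need escaping.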
@@ -1127,10 +1197,10 @@ void MPT::prompt(const std::string &prompt,
         // display text
         ++totalPredictions;

-        if (id == 50256 /*end of text*/)
+        if (id == 0 /*end of text*/)
             goto stop_generating;

-        const std::string str = mpt_token_to_str(d_ptr->vocab, id);
+        const std::string str = d_ptr->vocab.id_to_token[id];

         // Check if the provided str is part of our reverse prompts
         bool foundPartialReversePrompt = false;
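One note on the new lookup: id_to_token is a std::map, so operator[] on an id that is missing from the vocab silently inserts an empty string. A defensive helper (hypothetical, not part of the commit) could use find instead:

    // Hypothetical helper -- not in the commit; avoids operator[] inserting
    // an empty entry when an id is missing from the vocab.
    static std::string mpt_id_to_str(const mpt_vocab & vocab, int id) {
        auto it = vocab.id_to_token.find(id);
        return it != vocab.id_to_token.end() ? it->second : std::string();
    }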
@@ -1160,7 +1230,7 @@ void MPT::prompt(const std::string &prompt,
             if (promptCtx.tokens.size() == promptCtx.n_ctx)
                 promptCtx.tokens.erase(promptCtx.tokens.begin());
             promptCtx.tokens.push_back(t);
-            if (!responseCallback(t, mpt_token_to_str(d_ptr->vocab, t)))
+            if (!responseCallback(t, d_ptr->vocab.id_to_token[t]))
                 goto stop_generating;
         }
         cachedTokens.clear();