From dc559c157502422685e08afe84bbfef97b7fcfa4 Mon Sep 17 00:00:00 2001
From: Adam Treat
Date: Mon, 8 May 2023 12:06:32 -0400
Subject: [PATCH] Fix for special tokens.

---
 llmodel/mpt.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/llmodel/mpt.cpp b/llmodel/mpt.cpp
index fd905be3..bbd516e2 100644
--- a/llmodel/mpt.cpp
+++ b/llmodel/mpt.cpp
@@ -145,9 +145,16 @@ struct mpt_vocab {
     std::map<id, token> id_to_token;
     std::vector<std::string> special_tokens;
 
-    void add_special_token(const std::string &token);
+    void add_special_token(const std::string &token) {
+        special_tokens.push_back(token);
+    }
 };
 
+std::string regex_escape(const std::string &s) {
+    static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
+    return std::regex_replace(s, metacharacters, "\\$&");
+}
+
 // load the model's weights from a stream
 bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & model, mpt_vocab & vocab) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
@@ -215,6 +222,9 @@ bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & mod
 
         // TODO: this only kind-of works, the gpt_tokenize can still incorrectly
         // tokenize special tokens
+        if(special) {
+            vocab.add_special_token(regex_escape(word));
+        }
     }
 }
 
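
Note (not part of the patch): a minimal standalone sketch of why the special tokens are regex-escaped before being stored. The assumption here is that the tokenizer later joins vocab.special_tokens into a single std::regex alternation used to split input text around special tokens; main(), the sample tokens, and the "[SPECIAL]" replacement below are hypothetical and only illustrate the escaping.

#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Same helper as added in the diff above: backslash-escape regex metacharacters.
std::string regex_escape(const std::string &s) {
    static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
    return std::regex_replace(s, metacharacters, "\\$&");
}

int main() {
    // Hypothetical special tokens; in mpt.cpp they come from the model file.
    std::vector<std::string> special_tokens = {
        regex_escape("<|endoftext|>"),
        regex_escape("<|im_start|>"),
    };

    // Join the escaped tokens into one alternation subpattern.
    std::string subpattern;
    for (const auto &tok : special_tokens) {
        if (!subpattern.empty())
            subpattern += "|";
        subpattern += tok;
    }

    // Unescaped, the '|' inside "<|endoftext|>" would itself be parsed as an
    // alternation operator, so the pattern would match bare '<', '>' and
    // "endoftext" instead of the whole token.
    std::regex re(subpattern);
    std::string text = "hello<|endoftext|>world";
    std::cout << std::regex_replace(text, re, " [SPECIAL] ") << "\n";
    // prints: hello [SPECIAL] world
    return 0;
}

Escaping at load time keeps the stored tokens safe to drop directly into such a pattern, which appears to be what the regex_escape() call guards against when the vocabulary is read.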