From dc559c157502422685e08afe84bbfef97b7fcfa4 Mon Sep 17 00:00:00 2001
From: Adam Treat
Date: Mon, 8 May 2023 12:06:32 -0400
Subject: [PATCH] Fix for special tokens.

---
 llmodel/mpt.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/llmodel/mpt.cpp b/llmodel/mpt.cpp
index fd905be3..bbd516e2 100644
--- a/llmodel/mpt.cpp
+++ b/llmodel/mpt.cpp
@@ -145,9 +145,16 @@ struct mpt_vocab {
     std::map<id, token> id_to_token;
     std::vector<std::string> special_tokens;
 
-    void add_special_token(const std::string &token);
+    void add_special_token(const std::string &token) {
+        special_tokens.push_back(token);
+    }
 };
 
+std::string regex_escape(const std::string &s) {
+    static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
+    return std::regex_replace(s, metacharacters, "\\$&");
+}
+
 // load the model's weights from a stream
 bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & model, mpt_vocab & vocab) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
@@ -215,6 +222,9 @@ bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & mod
 
         // TODO: this only kind-of works, the gpt_tokenize can still incorrectly
         // tokenize special tokens
+        if(special) {
+            vocab.add_special_token(regex_escape(word));
+        }
     }
 }
 
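
Note (not part of the patch): a minimal standalone sketch of why the special tokens are regex-escaped before being stored. The assumption here is that the tokenizer later joins vocab.special_tokens into a single std::regex alternation used to split input text around special tokens; main(), the sample tokens, and the "[SPECIAL]" replacement below are hypothetical and only illustrate the escaping.

#include <iostream>
#include <regex>
#include <string>
#include <vector>

// Same helper as added in the diff above: backslash-escape regex metacharacters.
std::string regex_escape(const std::string &s) {
    static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
    return std::regex_replace(s, metacharacters, "\\$&");
}

int main() {
    // Hypothetical special tokens; in mpt.cpp they come from the model file.
    std::vector<std::string> special_tokens = {
        regex_escape("<|endoftext|>"),
        regex_escape("<|im_start|>"),
    };

    // Join the escaped tokens into one alternation subpattern.
    std::string subpattern;
    for (const auto &tok : special_tokens) {
        if (!subpattern.empty())
            subpattern += "|";
        subpattern += tok;
    }

    // Unescaped, the '|' inside "<|endoftext|>" would itself be parsed as an
    // alternation operator, so the pattern would match bare '<', '>' and
    // "endoftext" instead of the whole token.
    std::regex re(subpattern);
    std::string text = "hello<|endoftext|>world";
    std::cout << std::regex_replace(text, re, " [SPECIAL] ") << "\n";
    // prints: hello [SPECIAL] world
    return 0;
}

Escaping at load time keeps the stored tokens safe to drop directly into such a pattern, which appears to be what the regex_escape() call guards against when the vocabulary is read.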