From 8c4b8f215fa3da5cc15f6be99f0bfa6fcf0c0660 Mon Sep 17 00:00:00 2001
From: Adam Treat
Date: Mon, 8 May 2023 17:23:02 -0400
Subject: [PATCH] Fix gptj to have lower memory requirements for kv cache and
 add versioning to the internal state to smoothly handle such a fix in the
 future.

---
 chat.cpp          |  6 ++++++
 chatlistmodel.cpp |  2 +-
 chatllm.cpp       | 21 +++++++++++++++++++++
 chatllm.h         |  7 +++++++
 llmodel/gptj.cpp  |  2 +-
 5 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/chat.cpp b/chat.cpp
index 6d1782ff..75cad7eb 100644
--- a/chat.cpp
+++ b/chat.cpp
@@ -202,6 +202,12 @@ bool Chat::deserialize(QDataStream &stream, int version)
     stream >> m_userName;
     emit nameChanged();
     stream >> m_savedModelName;
+
+    // Prior to version 2, gptj models had a bug that pinned the kv_cache to F32
+    // instead of F16, so unfortunately we cannot deserialize these
+    if (version < 2 && m_savedModelName.contains("gpt4all-j"))
+        return false;
+
     if (!m_llmodel->deserialize(stream, version))
         return false;
     if (!m_chatModel->deserialize(stream, version))
diff --git a/chatlistmodel.cpp b/chatlistmodel.cpp
index 152d7f6f..1fbd0110 100644
--- a/chatlistmodel.cpp
+++ b/chatlistmodel.cpp
@@ -5,7 +5,7 @@
 #include 
 
 #define CHAT_FORMAT_MAGIC 0xF5D553CC
-#define CHAT_FORMAT_VERSION 1
+#define CHAT_FORMAT_VERSION 2
 
 ChatListModel::ChatListModel(QObject *parent)
     : QAbstractListModel(parent)
diff --git a/chatllm.cpp b/chatllm.cpp
index 196c2be1..7e95bf0b 100644
--- a/chatllm.cpp
+++ b/chatllm.cpp
@@ -16,6 +16,10 @@
 
 //#define DEBUG
 
+#define MPT_INTERNAL_STATE_VERSION 0
+#define GPTJ_INTERNAL_STATE_VERSION 0
+#define LLAMA_INTERNAL_STATE_VERSION 0
+
 static QString modelFilePath(const QString &modelName)
 {
     QString appPath = QCoreApplication::applicationDirPath()
@@ -96,12 +100,15 @@ bool ChatLLM::loadModel(const QString &modelName)
     isGPTJ = magic == 0x67676d6c;
     isMPT = magic == 0x67676d6d;
     if (isGPTJ) {
+        m_modelType = ModelType::GPTJ_;
         m_llmodel = new GPTJ;
         m_llmodel->loadModel(filePath.toStdString());
     } else if (isMPT) {
+        m_modelType = ModelType::MPT_;
         m_llmodel = new MPT;
         m_llmodel->loadModel(filePath.toStdString());
     } else {
+        m_modelType = ModelType::LLAMA_;
         m_llmodel = new LLamaModel;
         m_llmodel->loadModel(filePath.toStdString());
     }
@@ -380,6 +387,15 @@ bool ChatLLM::handleNameRecalculate(bool isRecalc)
 
 bool ChatLLM::serialize(QDataStream &stream, int version)
 {
+    if (version > 1) {
+        stream << m_modelType;
+        switch (m_modelType) {
+        case MPT_: stream << MPT_INTERNAL_STATE_VERSION; break;
+        case GPTJ_: stream << GPTJ_INTERNAL_STATE_VERSION; break;
+        case LLAMA_: stream << LLAMA_INTERNAL_STATE_VERSION; break;
+        default: Q_UNREACHABLE();
+        }
+    }
     stream << response();
     stream << generatedName();
     stream << m_promptResponseTokens;
@@ -400,6 +416,11 @@ bool ChatLLM::serialize(QDataStream &stream, int version)
 
 bool ChatLLM::deserialize(QDataStream &stream, int version)
 {
+    if (version > 1) {
+        int internalStateVersion;
+        stream >> m_modelType;
+        stream >> internalStateVersion; // for future use
+    }
     QString response;
     stream >> response;
     m_response = response.toStdString();
diff --git a/chatllm.h b/chatllm.h
index 8a2732d1..9e0b932f 100644
--- a/chatllm.h
+++ b/chatllm.h
@@ -17,6 +17,12 @@ class ChatLLM : public QObject
     Q_PROPERTY(QString generatedName READ generatedName NOTIFY generatedNameChanged)
 
 public:
+    enum ModelType {
+        MPT_,
+        GPTJ_,
+        LLAMA_
+    };
+
     ChatLLM(Chat *parent);
 
     bool isModelLoaded() const;
@@ -82,6 +88,7 @@ private:
     quint32 m_promptResponseTokens;
     quint32 m_responseLogits;
     QString m_modelName;
+    ModelType m_modelType;
     Chat *m_chat;
     QByteArray m_state;
     QThread m_llmThread;
diff --git a/llmodel/gptj.cpp b/llmodel/gptj.cpp
index a5d04ae7..8e5145f4 100644
--- a/llmodel/gptj.cpp
+++ b/llmodel/gptj.cpp
@@ -352,7 +352,7 @@ bool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & m
         const int n_mem = n_layer*n_ctx;
         const int n_elements = n_embd*n_mem;
 
-        if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F32, model.hparams.n_ctx)) {
+        if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F16, model.hparams.n_ctx)) {
             fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
             ggml_free(ctx);
             return false;
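
Note on the serialization scheme: the serialize/deserialize hunks above gate the
new fields on "version > 1", so the outer CHAT_FORMAT_VERSION controls the
stream layout while the per-model *_INTERNAL_STATE_VERSION is written (and
skipped on read) as a hook for future model-specific fixes. Below is a minimal
standalone sketch of that pattern using QDataStream; the saveState/restoreState
helpers and the k* constants are illustrative stand-ins, not identifiers from
this repository, and the guard is a simplified form of the chat.cpp check
(which also matches the saved model name).

    #include <QBuffer>
    #include <QDataStream>

    enum ModelType : qint32 { MPT_, GPTJ_, LLAMA_ };

    static const qint32 kFormatVersion    = 2; // plays the role of CHAT_FORMAT_VERSION
    static const qint32 kGptjStateVersion = 0; // plays the role of GPTJ_INTERNAL_STATE_VERSION

    // Hypothetical helper: writes the version header the patch adds for version > 1.
    static void saveState(QDataStream &out, ModelType type)
    {
        out << kFormatVersion;
        out << static_cast<qint32>(type);
        out << kGptjStateVersion; // reserved so a later fix can bump one model's state alone
        // ... the actual prompt/response/kv-cache state would follow here
    }

    // Hypothetical helper: refuses state written before the kv cache fix.
    static bool restoreState(QDataStream &in)
    {
        qint32 version;
        in >> version;
        if (version < 2)
            return false; // pre-fix gptj state carried an incompatible F32 kv cache
        qint32 type, stateVersion;
        in >> type >> stateVersion; // stateVersion is read but unused, for future use
        // ... deserialize the rest of the state here
        return true;
    }

    int main()
    {
        QByteArray blob;
        {
            QBuffer wbuf(&blob);
            wbuf.open(QIODevice::WriteOnly);
            QDataStream out(&wbuf);
            saveState(out, GPTJ_);
        }
        QBuffer rbuf(&blob);
        rbuf.open(QIODevice::ReadOnly);
        QDataStream in(&rbuf);
        return restoreState(in) ? 0 : 1; // 0 on a successful round trip
    }

The design point: bumping CHAT_FORMAT_VERSION invalidates only streams whose
layout actually changed, while each backend's internal state version can be
bumped independently the next time a single model needs a breaking fix.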