From 061d1969f8f199c09ac94a92cd74f59ab827be13 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Wed, 31 Jan 2024 14:17:44 -0500
Subject: [PATCH] expose n_gpu_layers parameter of llama.cpp (#1890)

Also dynamically limit the GPU layers and context length fields to the maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
---
 gpt4all-backend/bert.cpp                      |   6 +-
 gpt4all-backend/bert_impl.h                   |   4 +-
 gpt4all-backend/gptj.cpp                      |   6 +-
 gpt4all-backend/gptj_impl.h                   |   4 +-
 gpt4all-backend/llama.cpp-mainline            |   2 +-
 gpt4all-backend/llamamodel.cpp                | 134 +++++++++++-------
 gpt4all-backend/llamamodel_impl.h             |  19 +--
 gpt4all-backend/llmodel.cpp                   |  55 ++++---
 gpt4all-backend/llmodel.h                     |  25 +++-
 gpt4all-backend/llmodel_c.cpp                 |  17 +--
 gpt4all-backend/llmodel_c.h                   |   6 +-
 .../csharp/Gpt4All/Bindings/LLModel.cs        |   2 +-
 .../csharp/Gpt4All/Bindings/NativeMethods.cs  |   3 +-
 .../Gpt4All/Model/Gpt4AllModelFactory.cs      |   2 +-
 gpt4all-bindings/golang/binding.cpp           |   2 +-
 .../java/com/hexadevlabs/gpt4all/LLModel.java |   2 +-
 .../hexadevlabs/gpt4all/LLModelLibrary.java   |   2 +-
 gpt4all-bindings/python/gpt4all/gpt4all.py    |   6 +-
 gpt4all-bindings/python/gpt4all/pyllmodel.py  |  24 ++--
 gpt4all-bindings/typescript/index.cc          |   4 +-
 gpt4all-chat/chatgpt.cpp                      |   6 +-
 gpt4all-chat/chatgpt.h                        |   4 +-
 gpt4all-chat/chatlistmodel.h                  |   7 +
 gpt4all-chat/chatllm.cpp                      |  11 +-
 gpt4all-chat/embllm.cpp                       |   2 +-
 gpt4all-chat/main.cpp                         |   4 +
 gpt4all-chat/modellist.cpp                    |  49 +++++++
 gpt4all-chat/modellist.h                      |  32 +++--
 gpt4all-chat/mysettings.cpp                   |  23 +++
 gpt4all-chat/mysettings.h                     |   5 +
 gpt4all-chat/qml/ModelSettings.qml            |  70 ++++++++-
 31 files changed, 381 insertions(+), 157 deletions(-)

diff --git a/gpt4all-backend/bert.cpp b/gpt4all-backend/bert.cpp
index e2d21265..01b348d0 100644
--- a/gpt4all-backend/bert.cpp
+++ b/gpt4all-backend/bert.cpp
@@ -709,9 +709,10 @@ Bert::~Bert() {
     bert_free(d_ptr->ctx);
 }
 
-bool Bert::loadModel(const std::string &modelPath, int n_ctx)
+bool Bert::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     (void)n_ctx;
+    (void)ngl;
     d_ptr->ctx = bert_load_from_file(modelPath.c_str());
     d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
     d_ptr->modelLoaded = d_ptr->ctx != nullptr;
@@ -724,10 +725,11 @@ bool Bert::isModelLoaded() const
     return d_ptr->modelLoaded;
 }
 
-size_t Bert::requiredMem(const std::string &modelPath, int n_ctx)
+size_t Bert::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 {
     (void)modelPath;
     (void)n_ctx;
+    (void)ngl;
     return 0;
 }
 
diff --git a/gpt4all-backend/bert_impl.h b/gpt4all-backend/bert_impl.h
index b39e77e5..072e9783 100644
--- a/gpt4all-backend/bert_impl.h
+++ b/gpt4all-backend/bert_impl.h
@@ -18,9 +18,9 @@ public:
 
     bool supportsEmbedding() const override { return true; }
     bool supportsCompletion() const override { return true; }
-    bool loadModel(const std::string &modelPath, int n_ctx) override;
+    bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
     bool isModelLoaded() const override;
-    size_t requiredMem(const std::string &modelPath, int n_ctx) override;
+    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
     size_t stateSize() const override;
     size_t saveState(uint8_t *dest) const override;
     size_t restoreState(const uint8_t *src) override;
diff --git a/gpt4all-backend/gptj.cpp b/gpt4all-backend/gptj.cpp
index 6303ed84..40db378a 100644
--- a/gpt4all-backend/gptj.cpp
+++ b/gpt4all-backend/gptj.cpp
@@ -672,8 +672,9 @@ GPTJ::GPTJ()
     d_ptr->modelLoaded = false;
 }
 
-size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx) {
+size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
     (void)n_ctx;
+    (void)ngl;
     gptj_model dummy_model;
     gpt_vocab dummy_vocab;
     size_t mem_req;
@@ -681,8 +682,9 @@ size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx) {
     return mem_req;
 }
 
-bool GPTJ::loadModel(const std::string &modelPath, int n_ctx) {
+bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl) {
     (void)n_ctx;
+    (void)ngl;
     std::mt19937 rng(time(NULL));
     d_ptr->rng = rng;
 
diff --git a/gpt4all-backend/gptj_impl.h b/gpt4all-backend/gptj_impl.h
index c2100b24..01d5698f 100644
--- a/gpt4all-backend/gptj_impl.h
+++ b/gpt4all-backend/gptj_impl.h
@@ -17,9 +17,9 @@ public:
 
     bool supportsEmbedding() const override { return false; }
     bool supportsCompletion() const override { return true; }
-    bool loadModel(const std::string &modelPath, int n_ctx) override;
+    bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
     bool isModelLoaded() const override;
-    size_t requiredMem(const std::string &modelPath, int n_ctx) override;
+    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
     size_t stateSize() const override;
     size_t saveState(uint8_t *dest) const override;
     size_t restoreState(const uint8_t *src) override;
diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline
index 28921b84..997a7339 160000
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@@ -1 +1 @@
-Subproject commit 28921b84e4547c42fd7d23615d92c9d894a6cc2d
+Subproject commit 997a733992d38fe5851f68f4e69cf24fa983e8f1
diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index 4152dd19..1fab5138 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -32,6 +32,9 @@
 #include "ggml-kompute.h"
 #endif
 
+// Maximum supported GGUF version
+static constexpr int GGUF_VER_MAX = 3;
+
 namespace {
 const char *modelType_ = "LLaMA";
 }
@@ -121,8 +124,9 @@ struct llama_file_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 };
 
-size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx) {
+size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
     // TODO(cebtenzzre): update to GGUF
+    (void)ngl; // FIXME(cebtenzzre): use this value
     auto fin = std::ifstream(modelPath, std::ios::binary);
     fin.seekg(0, std::ios_base::end);
     size_t filesize = fin.tellg();
@@ -144,7 +148,7 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx) {
     return filesize + est_kvcache_size;
 }
 
-bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
+bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     gpt_params params;
 
@@ -168,11 +172,14 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx)
     if (llama_verbose()) {
         std::cerr << "llama.cpp: using Metal" << std::endl;
     }
+
+    // always fully offload on Metal
+    // TODO(cebtenzzre): use this parameter to allow using more than 53% of system RAM to load a model
     d_ptr->model_params.n_gpu_layers = 100;
 #elif defined(GGML_USE_KOMPUTE)
     if (d_ptr->device != -1) {
         d_ptr->model_params.main_gpu = d_ptr->device;
-        d_ptr->model_params.n_gpu_layers = 100;
+        d_ptr->model_params.n_gpu_layers = ngl;
     }
 #endif
 
@@ -323,13 +330,70 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
     return d_ptr->end_tokens;
 }
 
-#if defined(GGML_USE_KOMPUTE)
-#include "ggml-kompute.h"
-#endif
+std::string get_arch_name(gguf_context *ctx_gguf) {
+    // Returns the string stored under "general.architecture"; throws if it is missing or not a string.
+    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
+    // gguf_find_key yields -1 for an absent key, which is not a valid index for gguf_get_kv_type.
+    if (kid == -1 || gguf_get_kv_type(ctx_gguf, kid) != GGUF_TYPE_STRING) {
+        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
+    }
+    return gguf_get_val_str(ctx_gguf, kid);
+}
 
-std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
+// Open fname's GGUF metadata and read its architecture into arch; returns nullptr (after logging) on failure.
+// The caller owns the returned context and releases it with gguf_free().
+static gguf_context *load_gguf(const char *fname, std::string &arch) {
+    struct gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ nullptr };
+    gguf_context *ctx = gguf_init_from_file(fname, params);
+    if (!ctx) {
+        std::cerr << __func__ << ": gguf_init_from_file failed\n";
+        return nullptr;
+    }
+    if (int gguf_ver = gguf_get_version(ctx); gguf_ver > GGUF_VER_MAX) {
+        std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n";
+        gguf_free(ctx);
+        return nullptr;
+    }
+    try {
+        arch = get_arch_name(ctx);
+    } catch (...) {
+        gguf_free(ctx); // get_arch_name throws on a bad architecture key; don't leak ctx
+        throw;
+    }
+    return ctx;
+}
+
+// Reads the "<arch>.<archKey>" entry of the GGUF file at modelPath; -1 if loading fails or the key is absent.
+static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) {
+    std::string arch;
+    auto * ctx = load_gguf(modelPath.c_str(), arch);
+
+    int32_t value = -1;
+    if (ctx) {
+        auto key = arch + "." + archKey;
+        int keyidx = gguf_find_key(ctx, key.c_str());
+        if (keyidx != -1) {
+            value = gguf_get_val_u32(ctx, keyidx);
+        } else {
+            std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n";
+        }
+    }
+    gguf_free(ctx);
+    return value;
+}
+
+int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
 {
-#if defined(GGML_USE_KOMPUTE)
+    return get_arch_key_u32(modelPath, "context_length");
+}
+
+int32_t LLamaModel::layerCount(std::string const &modelPath) const
+{
+    return get_arch_key_u32(modelPath, "block_count");
+}
+
+std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired) const
+{
+#ifdef GGML_USE_KOMPUTE
     size_t count = 0;
     auto * vkDevices = ggml_vk_available_devices(memoryRequired, &count);
 
@@ -346,6 +410,7 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
                 /* name     = */ dev.name,
                 /* vendor   = */ dev.vendor
             );
+            ggml_vk_device_destroy(&dev);
         }
 
         free(vkDevices);
@@ -356,7 +421,7 @@ std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryReq
     return {};
 }
 
-bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name)
+bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &name) const
 {
 #if defined(GGML_USE_KOMPUTE)
     ggml_vk_device device;
@@ -372,11 +437,11 @@ bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string &n
     return false;
 }
 
-bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::string *unavail_reason)
+bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) const
 {
 #if defined(GGML_USE_KOMPUTE)
     (void)unavail_reason;
-    d_ptr->device = device.index;
+    d_ptr->device = device;
     return true;
 #else
     (void)device;
@@ -387,17 +452,6 @@ bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device, std::stri
 #endif
 }
 
-bool LLamaModel::initializeGPUDevice(int device)
-{
-#if defined(GGML_USE_KOMPUTE)
-    d_ptr->device = device;
-    return true;
-#else
-    (void)device;
-    return false;
-#endif
-}
-
 bool LLamaModel::hasGPUDevice()
 {
 #if defined(GGML_USE_KOMPUTE)
@@ -418,16 +472,6 @@ bool LLamaModel::usingGPUDevice()
 #endif
 }
 
-std::string get_arch_name(gguf_context *ctx_gguf) {
-    std::string arch_name;
-    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
-    enum gguf_type ktype = gguf_get_kv_type(ctx_gguf, kid);
-    if (ktype != (GGUF_TYPE_STRING)) {
-        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
-    }
-    return gguf_get_val_str(ctx_gguf, kid);
-}
-
 #if defined(_WIN32)
 #define DLL_EXPORT __declspec(dllexport)
 #else
@@ -447,35 +491,19 @@ DLL_EXPORT const char *get_build_variant() {
     return GGML_BUILD_VARIANT;
 }
 
-DLL_EXPORT bool magic_match(const char * fname) {
-    struct ggml_context * ctx_meta = NULL;
-    struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ &ctx_meta,
-    };
-    gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
-    if (!ctx_gguf) {
-        std::cerr << __func__ << ": gguf_init_from_file failed\n";
-        return false;
-    }
+DLL_EXPORT bool magic_match(const char *fname) {
+    std::string arch;
+    auto * ctx = load_gguf(fname, arch);
 
-    bool valid = true;
-
-    int gguf_ver = gguf_get_version(ctx_gguf);
-    if (valid && gguf_ver > 3) {
-        std::cerr << __func__ << ": unsupported gguf version: " << gguf_ver << "\n";
-        valid = false;
-    }
-
-    auto arch = get_arch_name(ctx_gguf);
-    if (valid && !(arch == "llama" || arch == "starcoder" || arch == "falcon" || arch == "mpt")) {
+    bool valid = ctx != nullptr; // load_gguf has already logged the reason when ctx is null
+    if (valid && !(arch == "llama" || arch == "starcoder" || arch == "falcon" || arch == "mpt")) {
         if (!(arch == "gptj" || arch == "bert")) { // we support these via other modules
             std::cerr << __func__ << ": unsupported model architecture: " << arch << "\n";
         }
         valid = false;
     }
 
-    gguf_free(ctx_gguf);
+    gguf_free(ctx);
     return valid;
 }
 
diff --git a/gpt4all-backend/llamamodel_impl.h b/gpt4all-backend/llamamodel_impl.h
index 7c097637..27eb580b 100644
--- a/gpt4all-backend/llamamodel_impl.h
+++ b/gpt4all-backend/llamamodel_impl.h
@@ -4,8 +4,9 @@
 #ifndef LLAMAMODEL_H
 #define LLAMAMODEL_H
 
-#include <string>
 #include <functional>
+#include <memory>
+#include <string>
 #include <vector>
 #include "llmodel.h"
 
@@ -17,23 +18,22 @@ public:
 
     bool supportsEmbedding() const override { return false; }
     bool supportsCompletion() const override { return true; }
-    bool loadModel(const std::string &modelPath, int n_ctx) override;
+    bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
     bool isModelLoaded() const override;
-    size_t requiredMem(const std::string &modelPath, int n_ctx) override;
+    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
     size_t stateSize() const override;
     size_t saveState(uint8_t *dest) const override;
     size_t restoreState(const uint8_t *src) override;
     void setThreadCount(int32_t n_threads) override;
     int32_t threadCount() const override;
-    std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) override;
-    bool initializeGPUDevice(size_t memoryRequired, const std::string& name) override;
-    bool initializeGPUDevice(const GPUDevice &device, std::string *unavail_reason) override;
-    bool initializeGPUDevice(int device) override;
+    std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const override;
+    bool initializeGPUDevice(size_t memoryRequired, const std::string& name) const override;
+    bool initializeGPUDevice(int device, std::string *unavail_reason) const override;
     bool hasGPUDevice() override;
     bool usingGPUDevice() override;
 
 private:
-    LLamaPrivate *d_ptr;
+    std::unique_ptr<LLamaPrivate> d_ptr;
 
 protected:
     std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
@@ -42,6 +42,9 @@ protected:
     bool evalTokens(PromptContext& ctx, const std::vector<int32_t> &tokens) const override;
     int32_t contextLength() const override;
     const std::vector<Token>& endTokens() const override;
+
+    int32_t maxContextLength(std::string const &modelPath) const override;
+    int32_t layerCount(std::string const &modelPath) const override;
 };
 
 #endif // LLAMAMODEL_H
diff --git a/gpt4all-backend/llmodel.cpp b/gpt4all-backend/llmodel.cpp
index bc9176f0..506b2c06 100644
--- a/gpt4all-backend/llmodel.cpp
+++ b/gpt4all-backend/llmodel.cpp
@@ -2,15 +2,17 @@
 #include "dlhandle.h"
 #include "sysinfo.h"
 
-#include <iostream>
-#include <string>
-#include <vector>
-#include <fstream>
-#include <filesystem>
 #include <cassert>
 #include <cstdlib>
-#include <sstream>
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <memory>
 #include <regex>
+#include <sstream>
+#include <string>
+#include <vector>
+
 #ifdef _MSC_VER
 #include <intrin.h>
 #endif
@@ -158,7 +160,7 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
                  * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
                  * most (all?) places where this is called, causing underestimation of required
                  * memory. */
-                size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx);
+                size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, 100);
                 float req_to_total = (float) req_mem / (float) total_mem;
                 // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
                 if (req_to_total >= 0.53) {
@@ -193,26 +195,39 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
 }
 
 LLModel *LLModel::Implementation::constructDefaultLlama() {
-    const LLModel::Implementation *impl = nullptr;
-    for (const auto &i : implementationList()) {
-        if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
-        impl = &i;
-    }
-    if (!impl) {
-        std::cerr << "LLModel ERROR: Could not find CPU LLaMA implementation\n";
-        return nullptr;
-    }
-    auto fres = impl->m_construct();
-    fres->m_implementation = impl;
-    return fres;
+    static std::unique_ptr<LLModel> llama([]() -> LLModel * {
+        const LLModel::Implementation *impl = nullptr;
+        for (const auto &i : implementationList()) {
+            if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
+            impl = &i;
+        }
+        if (!impl) {
+            std::cerr << "LLModel ERROR: Could not find CPU LLaMA implementation\n";
+            return nullptr;
+        }
+        auto fres = impl->m_construct();
+        fres->m_implementation = impl;
+        return fres;
+    }());
+    return llama.get();
 }
 
 std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices() {
-    static LLModel *llama = LLModel::Implementation::constructDefaultLlama(); // (memory leak)
+    auto * llama = constructDefaultLlama();
     if (llama) { return llama->availableGPUDevices(0); }
     return {};
 }
 
+int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
+    auto * llama = constructDefaultLlama();
+    return llama ? llama->maxContextLength(modelPath) : -1;
+}
+
+int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
+    auto * llama = constructDefaultLlama();
+    return llama ? llama->layerCount(modelPath) : -1;
+}
+
 void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) {
     s_implementations_search_path = path;
 }
diff --git a/gpt4all-backend/llmodel.h b/gpt4all-backend/llmodel.h
index a5ae2d54..7fc5e71d 100644
--- a/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/llmodel.h
@@ -42,6 +42,8 @@ public:
         static const Implementation *implementation(const char *fname, const std::string& buildVariant);
         static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
         static std::vector<GPUDevice> availableGPUDevices();
+        static int32_t maxContextLength(const std::string &modelPath);
+        static int32_t layerCount(const std::string &modelPath);
         static void setImplementationsSearchPath(const std::string& path);
         static const std::string& implementationsSearchPath();
 
@@ -77,9 +79,9 @@ public:
 
     virtual bool supportsEmbedding() const = 0;
     virtual bool supportsCompletion() const = 0;
-    virtual bool loadModel(const std::string &modelPath, int n_ctx) = 0;
+    virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
     virtual bool isModelLoaded() const = 0;
-    virtual size_t requiredMem(const std::string &modelPath, int n_ctx) = 0;
+    virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
     virtual size_t stateSize() const { return 0; }
     virtual size_t saveState(uint8_t */*dest*/) const { return 0; }
     virtual size_t restoreState(const uint8_t */*src*/) { return 0; }
@@ -101,18 +103,18 @@ public:
         return *m_implementation;
     }
 
-    virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) {
+    virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const {
         (void)memoryRequired;
         return {};
     }
 
-    virtual bool initializeGPUDevice(size_t memoryRequired, const std::string& name) {
+    virtual bool initializeGPUDevice(size_t memoryRequired, const std::string& name) const {
         (void)memoryRequired;
         (void)name;
         return false;
     }
 
-    virtual bool initializeGPUDevice(const GPUDevice & device, std::string *unavail_reason = nullptr) {
+    virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const {
         (void)device;
         if (unavail_reason) {
             *unavail_reason = "model has no GPU support";
@@ -120,7 +122,6 @@ public:
         return false;
     }
 
-    virtual bool initializeGPUDevice(int /*device*/) { return false; }
     virtual bool hasGPUDevice() { return false; }
     virtual bool usingGPUDevice() { return false; }
 
@@ -134,6 +135,18 @@ protected:
     virtual int32_t contextLength() const = 0;
     virtual const std::vector<Token>& endTokens() const = 0;
 
+    virtual int32_t maxContextLength(std::string const &modelPath) const
+    {
+        (void)modelPath;
+        return -1;
+    }
+
+    virtual int32_t layerCount(std::string const &modelPath) const
+    {
+        (void)modelPath;
+        return -1;
+    }
+
     // This is a helper function called from the default implementation of 'prompt' but it can be
     // shared by all base classes so it isn't virtual
     void recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate);
diff --git a/gpt4all-backend/llmodel_c.cpp b/gpt4all-backend/llmodel_c.cpp
index bfeca69c..8ba59b2b 100644
--- a/gpt4all-backend/llmodel_c.cpp
+++ b/gpt4all-backend/llmodel_c.cpp
@@ -47,16 +47,16 @@ void llmodel_model_destroy(llmodel_model model) {
     delete reinterpret_cast<LLModelWrapper*>(model);
 }
 
-size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx)
+size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl)
 {
     LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
-    return wrapper->llModel->requiredMem(model_path, n_ctx);
+    return wrapper->llModel->requiredMem(model_path, n_ctx, ngl);
 }
 
-bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx)
+bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl)
 {
     LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
-    return wrapper->llModel->loadModel(model_path, n_ctx);
+    return wrapper->llModel->loadModel(model_path, n_ctx, ngl);
 }
 
 bool llmodel_isModelLoaded(llmodel_model model)
@@ -230,15 +230,8 @@ bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryReq
 
 bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
 {
-    LLModel::GPUDevice d(
-        /* index    = */ device->index,
-        /* type     = */ device->type,
-        /* heapSize = */ device->heapSize,
-        /* name     = */ device->name,
-        /* vendor   = */ device->vendor
-    );
     LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
-    return wrapper->llModel->initializeGPUDevice(d);
+    return wrapper->llModel->initializeGPUDevice(device->index);
 }
 
 bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
diff --git a/gpt4all-backend/llmodel_c.h b/gpt4all-backend/llmodel_c.h
index dcd53f2e..50d35eda 100644
--- a/gpt4all-backend/llmodel_c.h
+++ b/gpt4all-backend/llmodel_c.h
@@ -111,18 +111,20 @@ void llmodel_model_destroy(llmodel_model model);
  * @param model A pointer to the llmodel_model instance.
  * @param model_path A string representing the path to the model file.
  * @param n_ctx Maximum size of context window
+ * @param ngl Number of GPU layers to use (Vulkan)
  * @return size greater than 0 if the model was parsed successfully, 0 if file could not be parsed.
  */
-size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx);
+size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl);
 
 /**
  * Load a model from a file.
  * @param model A pointer to the llmodel_model instance.
  * @param model_path A string representing the path to the model file.
  * @param n_ctx Maximum size of context window
+ * @param ngl Number of GPU layers to use (Vulkan)
  * @return true if the model was loaded successfully, false otherwise.
  */
-bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx);
+bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl);
 
 /**
  * Check if a model is loaded.
diff --git a/gpt4all-bindings/csharp/Gpt4All/Bindings/LLModel.cs b/gpt4all-bindings/csharp/Gpt4All/Bindings/LLModel.cs
index 04fbbc5b..a56b38a5 100644
--- a/gpt4all-bindings/csharp/Gpt4All/Bindings/LLModel.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Bindings/LLModel.cs
@@ -183,7 +183,7 @@ public class LLModel : ILLModel
     /// <returns>true if the model was loaded successfully, false otherwise.</returns>
     public bool Load(string modelPath)
     {
-        return NativeMethods.llmodel_loadModel(_handle, modelPath, 2048);
+        return NativeMethods.llmodel_loadModel(_handle, modelPath, 2048, 100);
     }
 
     protected void Destroy()
diff --git a/gpt4all-bindings/csharp/Gpt4All/Bindings/NativeMethods.cs b/gpt4all-bindings/csharp/Gpt4All/Bindings/NativeMethods.cs
index c6ea9e11..7ac955c5 100644
--- a/gpt4all-bindings/csharp/Gpt4All/Bindings/NativeMethods.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Bindings/NativeMethods.cs
@@ -71,7 +71,8 @@ internal static unsafe partial class NativeMethods
     public static extern bool llmodel_loadModel(
         [NativeTypeName("llmodel_model")] IntPtr model,
         [NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string model_path,
-        [NativeTypeName("int32_t")] int n_ctx);
+        [NativeTypeName("int32_t")] int n_ctx,
+        [NativeTypeName("int32_t")] int ngl);
 
     [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
 
diff --git a/gpt4all-bindings/csharp/Gpt4All/Model/Gpt4AllModelFactory.cs b/gpt4all-bindings/csharp/Gpt4All/Model/Gpt4AllModelFactory.cs
index 5e668dc2..8350a66a 100644
--- a/gpt4all-bindings/csharp/Gpt4All/Model/Gpt4AllModelFactory.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Model/Gpt4AllModelFactory.cs
@@ -43,7 +43,7 @@ public class Gpt4AllModelFactory : IGpt4AllModelFactory
         }
         _logger.LogDebug("Model created handle=0x{ModelHandle:X8}", handle);
         _logger.LogInformation("Model loading started");
-        var loadedSuccessfully = NativeMethods.llmodel_loadModel(handle, modelPath, 2048);
+        var loadedSuccessfully = NativeMethods.llmodel_loadModel(handle, modelPath, 2048, 100);
         _logger.LogInformation("Model loading completed success={ModelLoadSuccess}", loadedSuccessfully);
         if (!loadedSuccessfully)
         {
diff --git a/gpt4all-bindings/golang/binding.cpp b/gpt4all-bindings/golang/binding.cpp
index 1ccb9e61..0026d865 100644
--- a/gpt4all-bindings/golang/binding.cpp
+++ b/gpt4all-bindings/golang/binding.cpp
@@ -23,7 +23,7 @@ void* load_model(const char *fname, int n_threads) {
         fprintf(stderr, "%s: error '%s'\n", __func__, new_error);
         return nullptr;
     }
-    if (!llmodel_loadModel(model, fname, 2048)) {
+    if (!llmodel_loadModel(model, fname, 2048, 100)) {
         llmodel_model_destroy(model);
         return nullptr;
     }
diff --git a/gpt4all-bindings/java/src/main/java/com/hexadevlabs/gpt4all/LLModel.java b/gpt4all-bindings/java/src/main/java/com/hexadevlabs/gpt4all/LLModel.java
index 6c0d053e..769de02a 100644
--- a/gpt4all-bindings/java/src/main/java/com/hexadevlabs/gpt4all/LLModel.java
+++ b/gpt4all-bindings/java/src/main/java/com/hexadevlabs/gpt4all/LLModel.java
@@ -195,7 +195,7 @@ public  class LLModel implements AutoCloseable {
         if(model == null) {
             throw new IllegalStateException("Could not load, gpt4all backend returned error: " + error.getValue().getString(0));
         }
-        library.llmodel_loadModel(model, modelPathAbs, 2048);
+        library.llmodel_loadModel(model, modelPathAbs, 2048, 100);
 
         if(!library.llmodel_isModelLoaded(model)){
             throw new IllegalStateException("The model " + modelName + " could not be loaded");
diff --git a/gpt4all-bindings/java/src/main/java/com/hexadevlabs/gpt4all/LLModelLibrary.java b/gpt4all-bindings/java/src/main/java/com/hexadevlabs/gpt4all/LLModelLibrary.java
index b2d48e34..356b6149 100644
--- a/gpt4all-bindings/java/src/main/java/com/hexadevlabs/gpt4all/LLModelLibrary.java
+++ b/gpt4all-bindings/java/src/main/java/com/hexadevlabs/gpt4all/LLModelLibrary.java
@@ -61,7 +61,7 @@ public interface LLModelLibrary {
 
     Pointer llmodel_model_create2(String model_path, String build_variant, PointerByReference error);
     void llmodel_model_destroy(Pointer model);
-    boolean llmodel_loadModel(Pointer model, String model_path, int n_ctx);
+    boolean llmodel_loadModel(Pointer model, String model_path, int n_ctx, int ngl);
     boolean llmodel_isModelLoaded(Pointer model);
     @u_int64_t long llmodel_get_state_size(Pointer model);
     @u_int64_t long llmodel_save_state_data(Pointer model, Pointer dest);
diff --git a/gpt4all-bindings/python/gpt4all/gpt4all.py b/gpt4all-bindings/python/gpt4all/gpt4all.py
index 50c3c88e..fcd0a91d 100644
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -70,6 +70,7 @@ class GPT4All:
         n_threads: Optional[int] = None,
         device: Optional[str] = "cpu",
         n_ctx: int = 2048,
+        ngl: int = 100,
         verbose: bool = False,
     ):
         """
@@ -92,6 +93,7 @@ class GPT4All:
 
                 Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
             n_ctx: Maximum size of context window
+            ngl: Number of GPU layers to use (Vulkan)
             verbose: If True, print debug messages.
         """
         self.model_type = model_type
@@ -99,8 +101,8 @@ class GPT4All:
         # Retrieve model and download if allowed
         self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
         if device is not None and device != "cpu":
-            self.model.init_gpu(model_path=self.config["path"], device=device, n_ctx=n_ctx)
-        self.model.load_model(self.config["path"], n_ctx)
+            self.model.init_gpu(model_path=self.config["path"], device=device, n_ctx=n_ctx, ngl=ngl)
+        self.model.load_model(self.config["path"], n_ctx, ngl)
         # Set n_threads
         if n_threads is not None:
             self.model.set_thread_count(n_threads)
diff --git a/gpt4all-bindings/python/gpt4all/pyllmodel.py b/gpt4all-bindings/python/gpt4all/pyllmodel.py
index f3a1ee8e..f313e305 100644
--- a/gpt4all-bindings/python/gpt4all/pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/pyllmodel.py
@@ -182,16 +182,16 @@ class LLModel:
         if self.model is not None:
             self.llmodel_lib.llmodel_model_destroy(self.model)
 
-    def memory_needed(self, model_path: str, n_ctx: int) -> int:
+    def memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
         self.model = None
-        return self._memory_needed(model_path, n_ctx)
+        return self._memory_needed(model_path, n_ctx, ngl)  # self.model was reset above, so _memory_needed creates a new model handle
 
-    def _memory_needed(self, model_path: str, n_ctx: int) -> int:
+    def _memory_needed(self, model_path: str, n_ctx: int, ngl: int) -> int:
         if self.model is None:
             self.model = _create_model(model_path.encode())
-        return llmodel.llmodel_required_mem(self.model, model_path.encode(), n_ctx)
+        return llmodel.llmodel_required_mem(self.model, model_path.encode(), n_ctx, ngl)  # NOTE(review): confirm llmodel_required_mem.argtypes declares this 4th (ngl) argument
 
-    def list_gpu(self, model_path: str, n_ctx: int) -> list[LLModelGPUDevice]:
+    def list_gpu(self, model_path: str, n_ctx: int, ngl: int) -> list[LLModelGPUDevice]:
         """
         Lists available GPU devices that satisfy the model's memory requirements.
 
@@ -201,13 +201,15 @@ class LLModel:
             Path to the model.
         n_ctx : int
             Maximum size of context window
+        ngl : int
+            Number of GPU layers to use (Vulkan)
 
         Returns
         -------
         list
             A list of LLModelGPUDevice structures representing available GPU devices.
         """
-        mem_required = self._memory_needed(model_path, n_ctx)
+        mem_required = self._memory_needed(model_path, n_ctx, ngl)
         return self._list_gpu(mem_required)
 
     def _list_gpu(self, mem_required: int) -> list[LLModelGPUDevice]:
@@ -217,8 +219,8 @@ class LLModel:
             raise ValueError("Unable to retrieve available GPU devices")
         return devices_ptr[:num_devices.value]
 
-    def init_gpu(self, model_path: str, device: str, n_ctx: int):
-        mem_required = self._memory_needed(model_path, n_ctx)
+    def init_gpu(self, model_path: str, device: str, n_ctx: int, ngl: int):
+        mem_required = self._memory_needed(model_path, n_ctx, ngl)
 
         success = self.llmodel_lib.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode())
         if not success:
@@ -241,7 +243,7 @@ class LLModel:
             error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus)
             raise ValueError(error_msg)
 
-    def load_model(self, model_path: str, n_ctx: int) -> bool:
+    def load_model(self, model_path: str, n_ctx: int, ngl: int) -> bool:
         """
         Load model from a file.
 
@@ -251,6 +253,8 @@ class LLModel:
             Model filepath
         n_ctx : int
             Maximum size of context window
+        ngl : int
+            Number of GPU layers to use (Vulkan)
 
         Returns
         -------
@@ -258,7 +262,7 @@ class LLModel:
         """
         self.model = _create_model(model_path.encode())
 
-        llmodel.llmodel_loadModel(self.model, model_path.encode(), n_ctx)
+        llmodel.llmodel_loadModel(self.model, model_path.encode(), n_ctx, ngl)
 
         filename = os.path.basename(model_path)
         self.model_name = os.path.splitext(filename)[0]
diff --git a/gpt4all-bindings/typescript/index.cc b/gpt4all-bindings/typescript/index.cc
index 6a69ad49..31a3e3c4 100644
--- a/gpt4all-bindings/typescript/index.cc
+++ b/gpt4all-bindings/typescript/index.cc
@@ -28,7 +28,7 @@ Napi::Function NodeModelWrapper::GetClass(Napi::Env env) {
 Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo& info) 
 {
     auto env = info.Env();
-    return Napi::Number::New(env, static_cast<uint32_t>( llmodel_required_mem(GetInference(), full_model_path.c_str(), 2048) ));
+    return Napi::Number::New(env, static_cast<double>( llmodel_required_mem(GetInference(), full_model_path.c_str(), 2048, 100) )); // double, not uint32_t: byte counts of 4 GiB and above would be truncated
 
 }
   Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo& info) 
@@ -161,7 +161,7 @@ Napi::Value NodeModelWrapper::GetRequiredMemory(const Napi::CallbackInfo& info)
         }
     }
 
-    auto success = llmodel_loadModel(GetInference(), full_weight_path.c_str(), 2048);
+    auto success = llmodel_loadModel(GetInference(), full_weight_path.c_str(), 2048, 100);
     if(!success) {
         Napi::Error::New(env, "Failed to load model at given path").ThrowAsJavaScriptException(); 
         return;
diff --git a/gpt4all-chat/chatgpt.cpp b/gpt4all-chat/chatgpt.cpp
index 98d241dd..5f3da91d 100644
--- a/gpt4all-chat/chatgpt.cpp
+++ b/gpt4all-chat/chatgpt.cpp
@@ -20,17 +20,19 @@ ChatGPT::ChatGPT()
 {
 }
 
-size_t ChatGPT::requiredMem(const std::string &modelPath, int n_ctx)
+size_t ChatGPT::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
 {
     Q_UNUSED(modelPath);
     Q_UNUSED(n_ctx);
+    Q_UNUSED(ngl); // accepted to match the widened signature; this implementation ignores it and always reports 0
     return 0;
 }
 
-bool ChatGPT::loadModel(const std::string &modelPath, int n_ctx)
+bool ChatGPT::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 {
     Q_UNUSED(modelPath);
     Q_UNUSED(n_ctx);
+    Q_UNUSED(ngl); // accepted to match the widened signature; this implementation ignores it and always reports success
     return true;
 }
 
diff --git a/gpt4all-chat/chatgpt.h b/gpt4all-chat/chatgpt.h
index 7bb3912f..11d84606 100644
--- a/gpt4all-chat/chatgpt.h
+++ b/gpt4all-chat/chatgpt.h
@@ -48,9 +48,9 @@ public:
 
     bool supportsEmbedding() const override { return false; }
     bool supportsCompletion() const override { return true; }
-    bool loadModel(const std::string &modelPath, int n_ctx) override;
+    bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
     bool isModelLoaded() const override;
-    size_t requiredMem(const std::string &modelPath, int n_ctx) override;
+    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
     size_t stateSize() const override;
     size_t saveState(uint8_t *dest) const override;
     size_t restoreState(const uint8_t *src) override;
diff --git a/gpt4all-chat/chatlistmodel.h b/gpt4all-chat/chatlistmodel.h
index 20729204..24a95c3d 100644
--- a/gpt4all-chat/chatlistmodel.h
+++ b/gpt4all-chat/chatlistmodel.h
@@ -192,6 +192,13 @@ public:
 
     int count() const { return m_chats.size(); }
 
+    void clearChats() { // reset the three cached chat pointers and empty the chat list
+        m_newChat = nullptr;
+        m_serverChat = nullptr;
+        m_currentChat = nullptr;
+        m_chats.clear(); // NOTE(review): if m_chats stores raw Chat*, clear() alone does not delete them - confirm ownership
+    }
+
     void removeChatFile(Chat *chat) const;
     Q_INVOKABLE void saveChats();
     void restoreChat(Chat *chat);
diff --git a/gpt4all-chat/chatllm.cpp b/gpt4all-chat/chatllm.cpp
index ad13a81a..eac7b0c2 100644
--- a/gpt4all-chat/chatllm.cpp
+++ b/gpt4all-chat/chatllm.cpp
@@ -247,10 +247,9 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
             model->setAPIKey(apiKey);
             m_llModelInfo.model = model;
         } else {
-
-            // TODO: make configurable in UI
             auto n_ctx = MySettings::globalInstance()->modelContextLength(modelInfo);
             m_ctx.n_ctx = n_ctx;
+            auto ngl = MySettings::globalInstance()->modelGpuLayers(modelInfo);
 
             std::string buildVariant = "auto";
 #if defined(Q_OS_MAC) && defined(__arm__)
@@ -269,7 +268,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
                 if (requestedDevice == "CPU") {
                     emit reportFallbackReason(""); // fallback not applicable
                 } else {
-                    const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx);
+                    const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString(), n_ctx, ngl);
                     std::vector<LLModel::GPUDevice> availableDevices = m_llModelInfo.model->availableGPUDevices(requiredMemory);
                     LLModel::GPUDevice *device = nullptr;
 
@@ -288,7 +287,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
                     std::string unavail_reason;
                     if (!device) {
                         // GPU not available
-                    } else if (!m_llModelInfo.model->initializeGPUDevice(*device, &unavail_reason)) {
+                    } else if (!m_llModelInfo.model->initializeGPUDevice(device->index, &unavail_reason)) {
                         emit reportFallbackReason(QString::fromStdString("<br>" + unavail_reason));
                     } else {
                         actualDevice = QString::fromStdString(device->name);
@@ -298,14 +297,14 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
                 // Report which device we're actually using
                 emit reportDevice(actualDevice);
 
-                bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx);
+                bool success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, ngl);
                 if (actualDevice == "CPU") {
                     // we asked llama.cpp to use the CPU
                 } else if (!success) {
                     // llama_init_from_file returned nullptr
                     emit reportDevice("CPU");
                     emit reportFallbackReason("<br>GPU loading failed (out of VRAM?)");
-                    success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx);
+                    success = m_llModelInfo.model->loadModel(filePath.toStdString(), n_ctx, 0);
                 } else if (!m_llModelInfo.model->usingGPUDevice()) {
                     // ggml_vk_init was not called in llama.cpp
                     // We might have had to fallback to CPU after load if the model is not possible to accelerate
diff --git a/gpt4all-chat/embllm.cpp b/gpt4all-chat/embllm.cpp
index 7be2d348..bc7b2ef9 100644
--- a/gpt4all-chat/embllm.cpp
+++ b/gpt4all-chat/embllm.cpp
@@ -30,7 +30,7 @@ bool EmbeddingLLM::loadModel()
     }
 
     m_model = LLModel::Implementation::construct(filePath.toStdString());
-    bool success = m_model->loadModel(filePath.toStdString(), 2048);
+    bool success = m_model->loadModel(filePath.toStdString(), 2048, 0);
     if (!success) {
         qWarning() << "WARNING: Could not load sbert";
         delete m_model;
diff --git a/gpt4all-chat/main.cpp b/gpt4all-chat/main.cpp
index 1b8cb353..d015581a 100644
--- a/gpt4all-chat/main.cpp
+++ b/gpt4all-chat/main.cpp
@@ -63,5 +63,9 @@ int main(int argc, char *argv[])
     }
 #endif
 
-    return app.exec();
+    int res = app.exec(); // run the event loop first; the cleanup below must only happen at shutdown
+    // Make sure ChatLLM threads are joined before global destructors run.
+    // Otherwise, we can get a heap-use-after-free inside of llama.cpp.
+    ChatListModel::globalInstance()->clearChats();
+    return res;
 }
diff --git a/gpt4all-chat/modellist.cpp b/gpt4all-chat/modellist.cpp
index bb688361..97403594 100644
--- a/gpt4all-chat/modellist.cpp
+++ b/gpt4all-chat/modellist.cpp
@@ -1,6 +1,7 @@
 #include "modellist.h"
 #include "mysettings.h"
 #include "network.h"
+#include "../gpt4all-backend/llmodel.h"
 
 #include <QFile>
 #include <QStandardPaths>
@@ -108,6 +109,41 @@ void ModelInfo::setContextLength(int l)
     m_contextLength = l;
 }
 
+int ModelInfo::maxContextLength() const
+{
+    // Query the backend only while the cache still holds its -1 "unset" marker.
+    if (m_maxContextLength == -1) {
+        auto modelPath = (dirpath + filename()).toStdString();
+        const int reported = LLModel::Implementation::maxContextLength(modelPath);
+        // A negative result from the backend is replaced by the 4096 fallback.
+        m_maxContextLength = reported < 0 ? 4096 : reported;
+    }
+    return m_maxContextLength;
+}
+
+int ModelInfo::gpuLayers() const
+{
+    return MySettings::globalInstance()->modelGpuLayers(*this); // read through MySettings rather than returning m_gpuLayers directly
+}
+
+void ModelInfo::setGpuLayers(int l)
+{
+    if (isClone) MySettings::globalInstance()->setModelGpuLayers(*this, l, isClone /*force*/); // only clones write through to MySettings here, always with force=true
+    m_gpuLayers = l; // the in-memory value is updated for every model, clone or not
+}
+
+int ModelInfo::maxGpuLayers() const
+{
+    // Query the backend only while the cache still holds its -1 "unset" marker.
+    if (m_maxGpuLayers == -1) {
+        auto modelPath = (dirpath + filename()).toStdString();
+        const int reported = LLModel::Implementation::layerCount(modelPath);
+        // A negative result from the backend is replaced by the fallback of 100.
+        m_maxGpuLayers = reported < 0 ? 100 : reported;
+    }
+    return m_maxGpuLayers;
+}
+
 double ModelInfo::repeatPenalty() const
 {
     return MySettings::globalInstance()->modelRepeatPenalty(*this);
@@ -286,6 +322,7 @@ ModelList::ModelList()
     connect(MySettings::globalInstance(), &MySettings::maxLengthChanged, this, &ModelList::updateDataForSettings);
     connect(MySettings::globalInstance(), &MySettings::promptBatchSizeChanged, this, &ModelList::updateDataForSettings);
     connect(MySettings::globalInstance(), &MySettings::contextLengthChanged, this, &ModelList::updateDataForSettings);
+    connect(MySettings::globalInstance(), &MySettings::gpuLayersChanged, this, &ModelList::updateDataForSettings);
     connect(MySettings::globalInstance(), &MySettings::repeatPenaltyChanged, this, &ModelList::updateDataForSettings);
     connect(MySettings::globalInstance(), &MySettings::repeatPenaltyTokensChanged, this, &ModelList::updateDataForSettings);;
     connect(MySettings::globalInstance(), &MySettings::promptTemplateChanged, this, &ModelList::updateDataForSettings);
@@ -539,6 +576,8 @@ QVariant ModelList::dataInternal(const ModelInfo *info, int role) const
             return info->promptBatchSize();
         case ContextLengthRole:
             return info->contextLength();
+        case GpuLayersRole:
+            return info->gpuLayers();
         case RepeatPenaltyRole:
             return info->repeatPenalty();
         case RepeatPenaltyTokensRole:
@@ -664,6 +703,10 @@ void ModelList::updateData(const QString &id, int role, const QVariant &value)
             info->setMaxLength(value.toInt()); break;
         case PromptBatchSizeRole:
             info->setPromptBatchSize(value.toInt()); break;
+        case ContextLengthRole:
+            info->setContextLength(value.toInt()); break;
+        case GpuLayersRole:
+            info->setGpuLayers(value.toInt()); break;
         case RepeatPenaltyRole:
             info->setRepeatPenalty(value.toDouble()); break;
         case RepeatPenaltyTokensRole:
@@ -755,6 +798,7 @@ QString ModelList::clone(const ModelInfo &model)
     updateData(id, ModelList::MaxLengthRole, model.maxLength());
     updateData(id, ModelList::PromptBatchSizeRole, model.promptBatchSize());
     updateData(id, ModelList::ContextLengthRole, model.contextLength());
+    updateData(id, ModelList::GpuLayersRole, model.gpuLayers()); // copy the source model's GPU layer count (not its context length)
     updateData(id, ModelList::RepeatPenaltyRole, model.repeatPenalty());
     updateData(id, ModelList::RepeatPenaltyTokensRole, model.repeatPenaltyTokens());
     updateData(id, ModelList::PromptTemplateRole, model.promptTemplate());
@@ -1123,6 +1167,8 @@ void ModelList::parseModelsJsonFile(const QByteArray &jsonData, bool save)
             updateData(id, ModelList::PromptBatchSizeRole, obj["promptBatchSize"].toInt());
         if (obj.contains("contextLength"))
             updateData(id, ModelList::ContextLengthRole, obj["contextLength"].toInt());
+        if (obj.contains("gpuLayers"))
+            updateData(id, ModelList::GpuLayersRole, obj["gpuLayers"].toInt());
         if (obj.contains("repeatPenalty"))
             updateData(id, ModelList::RepeatPenaltyRole, obj["repeatPenalty"].toDouble());
         if (obj.contains("repeatPenaltyTokens"))
@@ -1217,6 +1263,8 @@ void ModelList::updateModelsFromSettings()
         const int promptBatchSize = settings.value(g + "/promptBatchSize").toInt();
         Q_ASSERT(settings.contains(g + "/contextLength"));
         const int contextLength = settings.value(g + "/contextLength").toInt();
+        // Clones saved by versions without this setting have no gpuLayers key; default to 100 instead of reading 0.
+        const int gpuLayers = settings.value(g + "/gpuLayers", 100).toInt();
         Q_ASSERT(settings.contains(g + "/repeatPenalty"));
         const double repeatPenalty = settings.value(g + "/repeatPenalty").toDouble();
         Q_ASSERT(settings.contains(g + "/repeatPenaltyTokens"));
@@ -1236,6 +1284,7 @@ void ModelList::updateModelsFromSettings()
         updateData(id, ModelList::MaxLengthRole, maxLength);
         updateData(id, ModelList::PromptBatchSizeRole, promptBatchSize);
         updateData(id, ModelList::ContextLengthRole, contextLength);
+        updateData(id, ModelList::GpuLayersRole, gpuLayers);
         updateData(id, ModelList::RepeatPenaltyRole, repeatPenalty);
         updateData(id, ModelList::RepeatPenaltyTokensRole, repeatPenaltyTokens);
         updateData(id, ModelList::PromptTemplateRole, promptTemplate);
diff --git a/gpt4all-chat/modellist.h b/gpt4all-chat/modellist.h
index c3145407..475d6a40 100644
--- a/gpt4all-chat/modellist.h
+++ b/gpt4all-chat/modellist.h
@@ -40,6 +40,9 @@ struct ModelInfo {
     Q_PROPERTY(int maxLength READ maxLength WRITE setMaxLength)
     Q_PROPERTY(int promptBatchSize READ promptBatchSize WRITE setPromptBatchSize)
     Q_PROPERTY(int contextLength READ contextLength WRITE setContextLength)
+    Q_PROPERTY(int maxContextLength READ maxContextLength)
+    Q_PROPERTY(int gpuLayers READ gpuLayers WRITE setGpuLayers)
+    Q_PROPERTY(int maxGpuLayers READ maxGpuLayers)
     Q_PROPERTY(double repeatPenalty READ repeatPenalty WRITE setRepeatPenalty)
     Q_PROPERTY(int repeatPenaltyTokens READ repeatPenaltyTokens WRITE setRepeatPenaltyTokens)
     Q_PROPERTY(QString promptTemplate READ promptTemplate WRITE setPromptTemplate)
@@ -97,6 +100,10 @@ public:
     void setPromptBatchSize(int s);
     int contextLength() const;
     void setContextLength(int l);
+    int maxContextLength() const;
+    int gpuLayers() const;
+    void setGpuLayers(int l);
+    int maxGpuLayers() const;
     double repeatPenalty() const;
     void setRepeatPenalty(double p);
     int repeatPenaltyTokens() const;
@@ -110,16 +117,19 @@ private:
     QString m_id;
     QString m_name;
     QString m_filename;
-    double  m_temperature         = 0.7;
-    double  m_topP                = 0.4;
-    int     m_topK                = 40;
-    int     m_maxLength           = 4096;
-    int     m_promptBatchSize     = 128;
-    int     m_contextLength       = 2048;
-    double  m_repeatPenalty       = 1.18;
-    int     m_repeatPenaltyTokens = 64;
-    QString m_promptTemplate      = "### Human:\n%1\n### Assistant:\n";
-    QString m_systemPrompt        = "### System:\nYou are an AI assistant who gives a quality response to whatever humans ask of you.\n";
+    double  m_temperature          = 0.7;
+    double  m_topP                 = 0.4;
+    int     m_topK                 = 40;
+    int     m_maxLength            = 4096;
+    int     m_promptBatchSize      = 128;
+    int     m_contextLength        = 2048;
+    mutable int m_maxContextLength = -1;
+    int     m_gpuLayers            = 100;
+    mutable int m_maxGpuLayers     = -1;
+    double  m_repeatPenalty        = 1.18;
+    int     m_repeatPenaltyTokens  = 64;
+    QString m_promptTemplate       = "### Human:\n%1\n### Assistant:\n";
+    QString m_systemPrompt         = "### System:\nYou are an AI assistant who gives a quality response to whatever humans ask of you.\n";
     friend class MySettings;
 };
 Q_DECLARE_METATYPE(ModelInfo)
@@ -232,6 +242,7 @@ public:
         MaxLengthRole,
         PromptBatchSizeRole,
         ContextLengthRole,
+        GpuLayersRole,
         RepeatPenaltyRole,
         RepeatPenaltyTokensRole,
         PromptTemplateRole,
@@ -275,6 +286,7 @@ public:
         roles[MaxLengthRole] = "maxLength";
         roles[PromptBatchSizeRole] = "promptBatchSize";
         roles[ContextLengthRole] = "contextLength";
+        roles[GpuLayersRole] = "gpuLayers";
         roles[RepeatPenaltyRole] = "repeatPenalty";
         roles[RepeatPenaltyTokensRole] = "repeatPenaltyTokens";
         roles[PromptTemplateRole] = "promptTemplate";
diff --git a/gpt4all-chat/mysettings.cpp b/gpt4all-chat/mysettings.cpp
index 5f5c7b80..f9774bde 100644
--- a/gpt4all-chat/mysettings.cpp
+++ b/gpt4all-chat/mysettings.cpp
@@ -91,6 +91,7 @@ void MySettings::restoreModelDefaults(const ModelInfo &model)
     setModelMaxLength(model, model.m_maxLength);
     setModelPromptBatchSize(model, model.m_promptBatchSize);
     setModelContextLength(model, model.m_contextLength);
+    setModelGpuLayers(model, model.m_gpuLayers);
     setModelRepeatPenalty(model, model.m_repeatPenalty);
     setModelRepeatPenaltyTokens(model, model.m_repeatPenaltyTokens);
     setModelPromptTemplate(model, model.m_promptTemplate);
@@ -303,6 +304,28 @@ void MySettings::setModelContextLength(const ModelInfo &m, int l, bool force)
         emit contextLengthChanged(m);
 }
 
+int MySettings::modelGpuLayers(const ModelInfo &m) const
+{
+    QSettings setting;
+    setting.sync(); // synchronise with the backing store before reading
+    return setting.value(QString("model-%1").arg(m.id()) + "/gpuLayers", m.m_gpuLayers).toInt(); // falls back to the ModelInfo's own m_gpuLayers when no per-model key is stored
+}
+
+void MySettings::setModelGpuLayers(const ModelInfo &m, int l, bool force)
+{
+    if (modelGpuLayers(m) == l && !force) // value unchanged and not forced: nothing to write
+        return;
+
+    QSettings setting;
+    if (m.m_gpuLayers == l && !m.isClone) // non-clone set to its ModelInfo value: remove the stored override
+        setting.remove(QString("model-%1").arg(m.id()) + "/gpuLayers");
+    else
+        setting.setValue(QString("model-%1").arg(m.id()) + "/gpuLayers", l);
+    setting.sync();
+    if (!force) // forced writes do not emit the change signal
+        emit gpuLayersChanged(m);
+}
+
 double MySettings::modelRepeatPenalty(const ModelInfo &m) const
 {
     QSettings setting;
diff --git a/gpt4all-chat/mysettings.h b/gpt4all-chat/mysettings.h
index 3287f413..4bfbef6b 100644
--- a/gpt4all-chat/mysettings.h
+++ b/gpt4all-chat/mysettings.h
@@ -63,6 +63,8 @@ public:
     Q_INVOKABLE void setModelSystemPrompt(const ModelInfo &m, const QString &p, bool force = false);
     int modelContextLength(const ModelInfo &m) const;
     Q_INVOKABLE void setModelContextLength(const ModelInfo &m, int s, bool force = false);
+    int modelGpuLayers(const ModelInfo &m) const;
+    Q_INVOKABLE void setModelGpuLayers(const ModelInfo &m, int s, bool force = false);
 
     // Application settings
     int threadCount() const;
@@ -85,6 +87,8 @@ public:
     void setDevice(const QString &u);
     int32_t contextLength() const;
     void setContextLength(int32_t value);
+    int32_t gpuLayers() const;
+    void setGpuLayers(int32_t value);
 
     // Release/Download settings
     QString lastVersionStarted() const;
@@ -121,6 +125,7 @@ Q_SIGNALS:
     void maxLengthChanged(const ModelInfo &model);
     void promptBatchSizeChanged(const ModelInfo &model);
     void contextLengthChanged(const ModelInfo &model);
+    void gpuLayersChanged(const ModelInfo &model);
     void repeatPenaltyChanged(const ModelInfo &model);
     void repeatPenaltyTokensChanged(const ModelInfo &model);
     void promptTemplateChanged(const ModelInfo &model);
diff --git a/gpt4all-chat/qml/ModelSettings.qml b/gpt4all-chat/qml/ModelSettings.qml
index 1c9b0249..ce2f5157 100644
--- a/gpt4all-chat/qml/ModelSettings.qml
+++ b/gpt4all-chat/qml/ModelSettings.qml
@@ -332,9 +332,6 @@ MySettingsTab {
                 ToolTip.visible: hovered
                 Layout.row: 0
                 Layout.column: 1
-                validator: IntValidator {
-                    bottom: 1
-                }
                 Connections {
                     target: MySettings
                     function onContextLengthChanged() {
@@ -349,11 +346,18 @@ MySettingsTab {
                 }
                 onEditingFinished: {
                     var val = parseInt(text)
-                    if (!isNaN(val)) {
+                    if (isNaN(val)) {
+                        text = root.currentModelInfo.contextLength
+                    } else {
+                        if (val < 8) {
+                            val = 8
+                            contextLengthField.text = val
+                        } else if (val > root.currentModelInfo.maxContextLength) {
+                            val = root.currentModelInfo.maxContextLength
+                            contextLengthField.text = val
+                        }
                         MySettings.setModelContextLength(root.currentModelInfo, val)
                         focus = false
-                    } else {
-                        text = root.currentModelInfo.contextLength
                     }
                 }
                 Accessible.role: Accessible.EditableText
@@ -674,6 +678,60 @@ MySettingsTab {
                 Accessible.name: repeatPenaltyTokensLabel.text
                 Accessible.description: ToolTip.text
             }
+
+            MySettingsLabel {
+                id: gpuLayersLabel
+                visible: !root.currentModelInfo.isChatGPT
+                text: qsTr("GPU Layers")
+                Layout.row: 4
+                Layout.column: 0
+            }
+            MyTextField {
+                id: gpuLayersField
+                visible: !root.currentModelInfo.isChatGPT
+                text: root.currentModelInfo.gpuLayers
+                font.pixelSize: theme.fontSizeLarge
+                color: theme.textColor
+                ToolTip.text: qsTr("How many GPU layers to load into VRAM. Decrease this if GPT4All runs out of VRAM while loading this model.\nLower values increase CPU load and RAM usage, and make inference slower.\nNOTE: Does not take effect until you RESTART GPT4All or SWITCH MODELS.")
+                ToolTip.visible: hovered
+                Layout.row: 4
+                Layout.column: 1
+                Connections {
+                    target: MySettings
+                    function onGpuLayersChanged() {
+                        gpuLayersField.text = root.currentModelInfo.gpuLayers
+                    }
+                }
+                Connections {
+                    target: root
+                    function onCurrentModelInfoChanged() {
+                        if (root.currentModelInfo.gpuLayers == 100) { // a value of exactly 100 is displayed as the model's maxGpuLayers
+                            gpuLayersField.text = root.currentModelInfo.maxGpuLayers
+                        } else {
+                            gpuLayersField.text = root.currentModelInfo.gpuLayers
+                        }
+                    }
+                }
+                onEditingFinished: {
+                    var val = parseInt(text)
+                    if (isNaN(val)) { // not a number: restore the stored value and save nothing
+                        gpuLayersField.text = root.currentModelInfo.gpuLayers
+                    } else {
+                        if (val < 1) { // clamp to the lower bound of 1
+                            val = 1
+                            gpuLayersField.text = val
+                        } else if (val > root.currentModelInfo.maxGpuLayers) { // clamp to the model's reported maximum
+                            val = root.currentModelInfo.maxGpuLayers
+                            gpuLayersField.text = val
+                        }
+                        MySettings.setModelGpuLayers(root.currentModelInfo, val) // persist the (possibly clamped) value
+                        focus = false
+                    }
+                }
+                Accessible.role: Accessible.EditableText
+                Accessible.name: gpuLayersLabel.text
+                Accessible.description: ToolTip.text
+            }
         }
 
         Rectangle {