diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt
index 0aa64234..b4833025 100644
--- a/gpt4all-backend/CMakeLists.txt
+++ b/gpt4all-backend/CMakeLists.txt
@@ -100,6 +100,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
 
     add_library(replit-mainline-${BUILD_VARIANT} SHARED
     replit.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
+    target_compile_definitions(replit-mainline-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
     prepare_target(replit-mainline llama-mainline)
 
     if (NOT LLAMA_METAL)
@@ -120,6 +121,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
 
         add_library(falcon-${BUILD_VARIANT} SHARED
             falcon.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
+        target_compile_definitions(falcon-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
         prepare_target(falcon llama-mainline)
 
         add_library(mpt-${BUILD_VARIANT} SHARED
@@ -128,6 +130,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
 
         add_library(bert-${BUILD_VARIANT} SHARED
             bert.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
+        target_compile_definitions(bert-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
         prepare_target(bert llama-mainline)
 
         add_library(starcoder-${BUILD_VARIANT} SHARED
diff --git a/gpt4all-backend/bert.cpp b/gpt4all-backend/bert.cpp
index 29532c48..77bdadce 100644
--- a/gpt4all-backend/bert.cpp
+++ b/gpt4all-backend/bert.cpp
@@ -1,5 +1,6 @@
 #define BERT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "bert_impl.h"
+#include "llmodel_shared.h"
 #include "ggml.h"
 
 #include <cassert>
@@ -91,22 +92,6 @@ struct bert_model
 };
 
 // Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
-struct bert_buffer {
-    uint8_t * data = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        delete[] data;
-        data = new uint8_t[size];
-        this->size = size;
-    }
-
-    ~bert_buffer() {
-        delete[] data;
-    }
-};
-
-
 struct bert_ctx
 {
     bert_model model;
@@ -115,7 +100,8 @@ struct bert_ctx
     size_t mem_per_token;
     int64_t mem_per_input;
     int32_t max_batch_n;
-    bert_buffer buf_compute;
+    llm_buffer buf_compute;
+    llm_buffer work_buf;
 };
 
 int32_t bert_n_embd(bert_ctx * ctx)
@@ -328,13 +314,12 @@ void bert_eval(
 
     struct ggml_init_params params = {
         .mem_size = buf_compute.size,
-        .mem_buffer = buf_compute.data,
+        .mem_buffer = buf_compute.addr,
         .no_alloc = false,
     };
 
     struct ggml_context *ctx0 = ggml_init(params);
     struct ggml_cgraph gf = {};
-    gf.n_threads = n_threads;
 
     // Embeddings. word_embeddings + token_type_embeddings + position_embeddings
     struct ggml_tensor *token_layer = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -466,7 +451,9 @@ void bert_eval(
     ggml_tensor *output = inpL;
     // run the computation
     ggml_build_forward_expand(&gf, output);
-    ggml_graph_compute(ctx0, &gf);
+    // ggml_graph_compute_g4a() plans the graph for n_threads and uses ctx->work_buf as the plan's work buffer
+    ggml_graph_compute_g4a(ctx->work_buf, &gf, n_threads);
+    // (replaces the former ggml_graph_compute(ctx0, &gf); compute now takes a plan instead of the ggml context)
 
 
     // float *dat = ggml_get_data_f32(output);
@@ -633,7 +620,7 @@ struct bert_ctx * bert_load_from_file(const char *fname)
         model_mem_req += n_layer * (n_intermediate * ggml_type_sizef(GGML_TYPE_F32)); // ff_i_b
         model_mem_req += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ff_o_b
 
-        model_mem_req += (5 + 16 * n_layer) * 256; // object overhead
+        model_mem_req += (5 + 16 * n_layer) * ggml_tensor_overhead(); // object overhead
 
 #if defined(DEBUG_BERT)
         printf("%s: ggml ctx size = %6.2f MB\n", __func__, model_mem_req / (1024.0 * 1024.0));
@@ -1063,4 +1050,4 @@ DLL_EXPORT bool magic_match(std::istream& f) {
 DLL_EXPORT LLModel *construct() {
     return new Bert;
 }
-}
\ No newline at end of file
+}
diff --git a/gpt4all-backend/falcon.cpp b/gpt4all-backend/falcon.cpp
index 8fe99c3a..fe1ec9f4 100644
--- a/gpt4all-backend/falcon.cpp
+++ b/gpt4all-backend/falcon.cpp
@@ -1,3 +1,4 @@
+#include "ggml.h"
 #define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "falcon_impl.h"
 #include "llama.h"
@@ -64,6 +65,7 @@ struct falcon_model {
     std::map<std::string, struct ggml_tensor*> tensors;
 
     llm_buffer eval_buf;
+    llm_buffer work_buf;
     llm_buffer scr0_buf;
     llm_buffer scr1_buf;
 };
@@ -446,7 +448,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
 //   - embd_w:    the predicted logits for the next token
 //
 bool falcon_eval(
-        const falcon_model & model,
+        falcon_model & model,
         const int n_threads,
         const int n_past,
         const std::vector<gpt_vocab::id> & embd_inp,
@@ -473,7 +475,6 @@ bool falcon_eval(
 
     struct ggml_context * ctx0 = ggml_init(eval_ctx_params);
     struct ggml_cgraph gf = {};
-    gf.n_threads = n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -546,8 +547,8 @@ bool falcon_eval(
                 head_dim * (n_head + n_head_kv) * sizeof_wtype);
 
             // using mode = 2 for neox mode
-            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2);
-            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2);
+            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, n_ctx);
+            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, n_ctx);
 
             // store key and value to memory
             {
@@ -678,7 +679,8 @@ bool falcon_eval(
 
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+    ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
+  
 
     //if (n_past%100 == 0) {
     //    ggml_graph_print   (&gf);
diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline
index da760ac3..69796668 160000
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@@ -1 +1 @@
-Subproject commit da760ac3829a89ab9d60ec797df8a570b9b8419a
+Subproject commit 697966680b27d9b4f05668605b863cb9aea3e15f
diff --git a/gpt4all-backend/llmodel_shared.h b/gpt4all-backend/llmodel_shared.h
index 6a66a5d1..2bc9ae77 100644
--- a/gpt4all-backend/llmodel_shared.h
+++ b/gpt4all-backend/llmodel_shared.h
@@ -1,6 +1,7 @@
 #pragma once
 #include <cstdint>
 #include <cstddef>
+#include <vector>
 #include <ggml.h>
 
 struct llm_buffer {
@@ -34,3 +35,14 @@ struct llm_kv_cache {
         }
     }
 };
+
+#if LLAMA_DATE >= 230519
+inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads) { // plan `graph` for `n_threads`, lend `buf` to the plan as work memory, then compute
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    if (plan.work_size > 0) { // the plan asks for work_size bytes of caller-provided work memory
+        if (buf.size < plan.work_size) buf.resize(plan.work_size); // grow only: keep the existing allocation when it is already big enough
+        plan.work_data = buf.addr;
+    }
+    ggml_graph_compute(graph, &plan);
+}
+#endif
diff --git a/gpt4all-backend/replit.cpp b/gpt4all-backend/replit.cpp
index 71a5cae4..b535b83e 100644
--- a/gpt4all-backend/replit.cpp
+++ b/gpt4all-backend/replit.cpp
@@ -196,6 +196,7 @@ struct replit_model {
 
     struct ggml_context * ctx;
     llm_buffer eval_buf;
+    llm_buffer work_buf;
     llm_buffer scr0_buf;
     llm_buffer scr1_buf;
     #ifdef GGML_USE_METAL
@@ -490,7 +491,7 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode
    model.scr1_buf.resize(256u * 1024 * 1024);
 
 #ifdef GGML_USE_METAL
-    model.ctx_metal = ggml_metal_init();
+    model.ctx_metal = ggml_metal_init(1);
     void* data_ptr = ggml_get_mem_buffer(model.ctx);
     size_t data_size = ggml_get_mem_size(model.ctx);
     const size_t max_size = ggml_get_max_tensor_size(model.ctx);
@@ -534,7 +535,7 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t
 //   - embd_inp:  the embeddings of the tokens in the context
 //   - embd_w:    the predicted logits for the next token
 //
-bool replit_eval(const replit_model & model, const int n_threads, const int n_past,
+bool replit_eval(replit_model & model, const int n_threads, const int n_past,
                  const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w, size_t & mem_per_token) {
     const int N = embd_inp.size();
 
@@ -552,7 +553,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
         .no_alloc = false,
     };
     struct ggml_context * ctx0 = ggml_init(eval_ctx_params);
-    struct ggml_cgraph gf = {.n_threads = n_threads};
+    struct ggml_cgraph gf = {};
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
@@ -706,10 +707,10 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
         ggml_metal_get_tensor(model.ctx_metal, model.kv_self.k);
         ggml_metal_get_tensor(model.ctx_metal, model.kv_self.v);
 
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
     }
 #else
-    ggml_graph_compute(ctx0, &gf);
+    ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
 #endif
 
     // std::cout << "Qcur" << std::endl;