// Patch overview (reviewer notes; this text sits before the first `diff --git` header and is
// not part of any hunk):
//
// Purpose: switch the bert, falcon and replit backends from the one-call
// ggml_graph_compute(ctx0, &gf) form (thread count stored in gf.n_threads) to a new helper,
// ggml_graph_compute_g4a(buf, graph, n_threads), added to llmodel_shared.h.
//
// Per file:
//   * CMakeLists.txt     - adds PRIVATE compile definitions LLAMA_VERSIONS=>=3 and
//                          LLAMA_DATE=999999 to the replit-mainline, falcon and bert targets.
//                          The new helper is wrapped in `#if LLAMA_DATE >= 230519`, so these
//                          definitions are what make it visible to those three targets.
//   * bert.cpp           - includes llmodel_shared.h, deletes the local bert_buffer struct and
//                          uses llm_buffer instead (field `addr` replaces `data`), adds a
//                          `work_buf` member to bert_ctx, drops the gf.n_threads assignment,
//                          and replaces the hard-coded 256-byte per-object overhead with
//                          ggml_tensor_overhead().
//   * falcon.cpp         - includes ggml.h first, adds `work_buf` to falcon_model, makes the
//                          `model` parameter of falcon_eval non-const (the helper is handed
//                          model.work_buf by non-const reference), and passes n_ctx as an extra
//                          trailing argument to ggml_rope_inplace.
//   * llama.cpp-mainline - submodule pointer moves from da760ac3... to 69796668....
//   * llmodel_shared.h   - adds ggml_graph_compute_g4a: it calls ggml_graph_plan(graph,
//                          n_threads); when plan.work_size > 0 it resizes `buf` to that size and
//                          points plan.work_data at buf.addr; then it calls
//                          ggml_graph_compute(graph, &plan).
//   * replit.cpp         - adds `work_buf` to replit_model, makes the `model` parameter of
//                          replit_eval non-const, calls ggml_metal_init(1) instead of
//                          ggml_metal_init(), value-initialises the graph with `{}` instead of
//                          `{.n_threads = n_threads}`, and routes both compute call sites
//                          through the helper.
//
// NOTE(review): the bert.cpp context line "Replacement for std::vector that doesn't require
//   zero-initialization." is kept while the bert_buffer struct it described is removed; after
//   applying, it appears to sit directly above struct bert_ctx - consider dropping it.
// NOTE(review): the bert_eval hunk adds two commented-out calls (`//ggml_graph_compute_g4a()`
//   and `//ggml_graph_compute(ctx0, &gf);`) - looks like leftover scaffolding.
// NOTE(review): the helper calls buf.resize(plan.work_size) on every invocation with a non-zero
//   work size. llm_buffer::resize is not shown in this patch - if it reallocates
//   unconditionally, consider resizing only when the buffer is too small. TODO confirm.
// NOTE(review): the falcon_eval hunk adds an extra blank line after the new helper call.
// NOTE(review): this copy of the patch appears to have lost angle-bracket text (bare `#include`,
//   `std::map tensors`, `std::vector & embd_inp`) and its line breaks - verify against the
//   original patch before applying.
diff --git a/gpt4all-backend/CMakeLists.txt b/gpt4all-backend/CMakeLists.txt index 0aa64234..b4833025 100644 --- a/gpt4all-backend/CMakeLists.txt +++ b/gpt4all-backend/CMakeLists.txt @@ -100,6 +100,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) add_library(replit-mainline-${BUILD_VARIANT} SHARED replit.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h) + target_compile_definitions(replit-mainline-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999) prepare_target(replit-mainline llama-mainline) if (NOT LLAMA_METAL) @@ -120,6 +121,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) add_library(falcon-${BUILD_VARIANT} SHARED falcon.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h) + target_compile_definitions(falcon-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999) prepare_target(falcon llama-mainline) add_library(mpt-${BUILD_VARIANT} SHARED @@ -128,6 +130,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) add_library(bert-${BUILD_VARIANT} SHARED bert.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h) + target_compile_definitions(bert-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999) prepare_target(bert llama-mainline) add_library(starcoder-${BUILD_VARIANT} SHARED diff --git a/gpt4all-backend/bert.cpp b/gpt4all-backend/bert.cpp index 29532c48..77bdadce 100644 --- a/gpt4all-backend/bert.cpp +++ b/gpt4all-backend/bert.cpp @@ -1,5 +1,6 @@ #define BERT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE #include "bert_impl.h" +#include "llmodel_shared.h" #include "ggml.h" #include @@ -91,22 +92,6 @@ struct bert_model }; // Replacement for std::vector that doesn't require zero-initialization. 
-struct bert_buffer { - uint8_t * data = NULL; - size_t size = 0; - - void resize(size_t size) { - delete[] data; - data = new uint8_t[size]; - this->size = size; - } - - ~bert_buffer() { - delete[] data; - } -}; - - struct bert_ctx { bert_model model; @@ -115,7 +100,8 @@ struct bert_ctx size_t mem_per_token; int64_t mem_per_input; int32_t max_batch_n; - bert_buffer buf_compute; + llm_buffer buf_compute; + llm_buffer work_buf; }; int32_t bert_n_embd(bert_ctx * ctx) @@ -328,13 +314,12 @@ void bert_eval( struct ggml_init_params params = { .mem_size = buf_compute.size, - .mem_buffer = buf_compute.data, + .mem_buffer = buf_compute.addr, .no_alloc = false, }; struct ggml_context *ctx0 = ggml_init(params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; // Embeddings. word_embeddings + token_type_embeddings + position_embeddings struct ggml_tensor *token_layer = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); @@ -466,7 +451,9 @@ void bert_eval( ggml_tensor *output = inpL; // run the computation ggml_build_forward_expand(&gf, output); - ggml_graph_compute(ctx0, &gf); + //ggml_graph_compute_g4a() + ggml_graph_compute_g4a(ctx->work_buf, &gf, n_threads); + //ggml_graph_compute(ctx0, &gf); // float *dat = ggml_get_data_f32(output); @@ -633,7 +620,7 @@ struct bert_ctx * bert_load_from_file(const char *fname) model_mem_req += n_layer * (n_intermediate * ggml_type_sizef(GGML_TYPE_F32)); // ff_i_b model_mem_req += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ff_o_b - model_mem_req += (5 + 16 * n_layer) * 256; // object overhead + model_mem_req += (5 + 16 * n_layer) * ggml_tensor_overhead(); // object overhead #if defined(DEBUG_BERT) printf("%s: ggml ctx size = %6.2f MB\n", __func__, model_mem_req / (1024.0 * 1024.0)); @@ -1063,4 +1050,4 @@ DLL_EXPORT bool magic_match(std::istream& f) { DLL_EXPORT LLModel *construct() { return new Bert; } -} \ No newline at end of file +} diff --git a/gpt4all-backend/falcon.cpp b/gpt4all-backend/falcon.cpp index 8fe99c3a..fe1ec9f4 
100644 --- a/gpt4all-backend/falcon.cpp +++ b/gpt4all-backend/falcon.cpp @@ -1,3 +1,4 @@ +#include "ggml.h" #define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE #include "falcon_impl.h" #include "llama.h" @@ -64,6 +65,7 @@ struct falcon_model { std::map tensors; llm_buffer eval_buf; + llm_buffer work_buf; llm_buffer scr0_buf; llm_buffer scr1_buf; }; @@ -446,7 +448,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca // - embd_w: the predicted logits for the next token // bool falcon_eval( - const falcon_model & model, + falcon_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, @@ -473,7 +475,6 @@ bool falcon_eval( struct ggml_context * ctx0 = ggml_init(eval_ctx_params); struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -546,8 +547,8 @@ bool falcon_eval( head_dim * (n_head + n_head_kv) * sizeof_wtype); // using mode = 2 for neox mode - Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2); - Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2); + Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, n_ctx); + Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, n_ctx); // store key and value to memory { @@ -678,7 +679,8 @@ bool falcon_eval( // run the computation ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute (ctx0, &gf); + ggml_graph_compute_g4a(model.work_buf, &gf, n_threads); + //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/gpt4all-backend/llama.cpp-mainline b/gpt4all-backend/llama.cpp-mainline index da760ac3..69796668 160000 --- a/gpt4all-backend/llama.cpp-mainline +++ b/gpt4all-backend/llama.cpp-mainline @@ -1 +1 @@ -Subproject commit da760ac3829a89ab9d60ec797df8a570b9b8419a +Subproject commit 697966680b27d9b4f05668605b863cb9aea3e15f diff --git a/gpt4all-backend/llmodel_shared.h 
b/gpt4all-backend/llmodel_shared.h index 6a66a5d1..2bc9ae77 100644 --- a/gpt4all-backend/llmodel_shared.h +++ b/gpt4all-backend/llmodel_shared.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include struct llm_buffer { @@ -34,3 +35,14 @@ struct llm_kv_cache { } } }; + +#if LLAMA_DATE >= 230519 +inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.addr; + } + ggml_graph_compute(graph, &plan); +} +#endif diff --git a/gpt4all-backend/replit.cpp b/gpt4all-backend/replit.cpp index 71a5cae4..b535b83e 100644 --- a/gpt4all-backend/replit.cpp +++ b/gpt4all-backend/replit.cpp @@ -196,6 +196,7 @@ struct replit_model { struct ggml_context * ctx; llm_buffer eval_buf; + llm_buffer work_buf; llm_buffer scr0_buf; llm_buffer scr1_buf; #ifdef GGML_USE_METAL @@ -490,7 +491,7 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode model.scr1_buf.resize(256u * 1024 * 1024); #ifdef GGML_USE_METAL - model.ctx_metal = ggml_metal_init(); + model.ctx_metal = ggml_metal_init(1); void* data_ptr = ggml_get_mem_buffer(model.ctx); size_t data_size = ggml_get_mem_size(model.ctx); const size_t max_size = ggml_get_max_tensor_size(model.ctx); @@ -534,7 +535,7 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t // - embd_inp: the embeddings of the tokens in the context // - embd_w: the predicted logits for the next token // -bool replit_eval(const replit_model & model, const int n_threads, const int n_past, +bool replit_eval(replit_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, std::vector & embd_w, size_t & mem_per_token) { const int N = embd_inp.size(); @@ -552,7 +553,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa .no_alloc = false, }; struct ggml_context * ctx0 = 
ggml_init(eval_ctx_params); - struct ggml_cgraph gf = {.n_threads = n_threads}; + struct ggml_cgraph gf = {}; struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd)); @@ -706,10 +707,10 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa ggml_metal_get_tensor(model.ctx_metal, model.kv_self.k); ggml_metal_get_tensor(model.ctx_metal, model.kv_self.v); - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_g4a(model.work_buf, &gf, n_threads); } #else - ggml_graph_compute(ctx0, &gf); + ggml_graph_compute_g4a(model.work_buf, &gf, n_threads); #endif // std::cout << "Qcur" << std::endl;