@ -5,6 +5,7 @@
# include "llmodel_shared.h"
# include <cassert>
# include <cinttypes>
# include <cmath>
# include <cstdio>
# include <cstring>
@ -80,7 +81,9 @@ struct mpt_model {
std : : map < std : : string , struct ggml_tensor * > tensors ;
llm_buffer buf ;
llm_buffer eval_buf ;
llm_buffer scr0_buf ;
llm_buffer scr1_buf ;
~ mpt_model ( ) {
if ( ctx ) {
@ -370,8 +373,8 @@ bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & mod
}
if ( tensor - > ne [ 0 ] ! = ne [ 0 ] | | tensor - > ne [ 1 ] ! = ne [ 1 ] ) {
fprintf ( stderr , " %s: tensor '%s' has wrong shape in model file: got [% d, %d], expected [%d, %d]\n " ,
__func__ , name . data ( ) , ( int ) tensor - > ne [ 0 ] , ( int ) tensor - > ne [ 1 ] , ne [ 0 ] , ne [ 1 ] ) ;
fprintf ( stderr , " %s: tensor '%s' has wrong shape in model file: got [% " PRI d64 " , %" PRI d64 " ], expected [%d, %d]\n " ,
__func__ , name . data ( ) , tensor - > ne [ 0 ] , tensor - > ne [ 1 ] , ne [ 0 ] , ne [ 1 ] ) ;
return false ;
}
@ -403,6 +406,9 @@ bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & mod
printf ( " %s: model size = %8.2f MB / num tensors = %d \n " , __func__ , total_size / 1024.0 / 1024.0 , n_tensors ) ;
}
model . scr0_buf . resize ( 256u * 1024 * 1024 ) ;
model . scr1_buf . resize ( 256u * 1024 * 1024 ) ;
return true ;
}
@ -438,24 +444,24 @@ bool mpt_eval(
const int n_vocab = hparams . n_vocab ;
const size_t init_buf_size = 1024 _MiB ;
if ( ! model . buf. addr | | model . buf. size < init_buf_size )
model . buf. resize ( init_buf_size ) ;
if ( ! model . eval_ buf. addr | | model . eval_ buf. size < init_buf_size )
model . eval_ buf. resize ( init_buf_size ) ;
if ( mem_per_token > 0 & & mem_per_token * N > model . buf. size ) {
if ( mem_per_token > 0 & & mem_per_token * N > model . eval_ buf. size ) {
const size_t buf_size_new = 1.1 * ( mem_per_token * N ) ; // add 10% to account for ggml object overhead
// printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.buf.size, buf_size_new);
// reallocate
model . buf. resize ( buf_size_new ) ;
if ( model . buf. addr = = nullptr ) {
fprintf ( stderr , " %s: failed to allocate %zu bytes \n " , __func__ , model . buf. size ) ;
model . eval_ buf. resize ( buf_size_new ) ;
if ( model . eval_ buf. addr = = nullptr ) {
fprintf ( stderr , " %s: failed to allocate %zu bytes \n " , __func__ , model . eval_ buf. size ) ;
return false ;
}
}
struct ggml_init_params params = {
. mem_size = model . buf. size ,
. mem_buffer = model . buf. addr ,
. mem_size = model . eval_ buf. size ,
. mem_buffer = model . eval_ buf. addr ,
. no_alloc = false
} ;
@ -470,6 +476,7 @@ bool mpt_eval(
struct ggml_tensor * inpL = ggml_get_rows ( ctx0 , model . wte , embd ) ;
for ( int il = 0 ; il < n_layer ; + + il ) {
ggml_set_scratch ( ctx0 , { 0 , model . scr0_buf . size , model . scr0_buf . addr , } ) ;
struct ggml_tensor * inpSA = inpL ;
struct ggml_tensor * cur = inpSA ;
@ -561,7 +568,7 @@ bool mpt_eval(
cur ) ;
}
ggml_set_scratch ( ctx0 , { 0 , model . scr1_buf . size , model . scr1_buf . addr , } ) ;
// residual
struct ggml_tensor * resSA = ggml_add ( ctx0 , cur , inpSA ) ;
// feed-forward network
@ -586,6 +593,7 @@ bool mpt_eval(
// self-attention + FF
inpL = ggml_add ( ctx0 , cur , resSA ) ;
}
ggml_set_scratch ( ctx0 , { 0 , model . scr0_buf . size , model . scr0_buf . addr , } ) ;
struct ggml_tensor * out = inpL ;
// -> logits
@ -594,6 +602,7 @@ bool mpt_eval(
out = ggml_mul ( ctx0 ,
ggml_repeat ( ctx0 , model . norm_f_w , out ) ,
out ) ;
ggml_set_scratch ( ctx0 , { 0 , 0 , nullptr , } ) ;
out = ggml_mul_mat ( ctx0 , model . wte , out ) ;
}