|
|
@ -469,26 +469,25 @@ bool mpt_eval(
|
|
|
|
|
|
|
|
|
|
|
|
const int d_key = n_embd/n_head;
|
|
|
|
const int d_key = n_embd/n_head;
|
|
|
|
|
|
|
|
|
|
|
|
static size_t buf_size = 256u*1024*1024;
|
|
|
|
static size_t buf_size = 1024u*MB;
|
|
|
|
static void * buf = malloc(buf_size);
|
|
|
|
if (!model.buf.addr || model.buf.size < buf_size)
|
|
|
|
|
|
|
|
model.buf.resize(buf_size);
|
|
|
|
|
|
|
|
|
|
|
|
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
|
|
|
if (mem_per_token > 0 && mem_per_token*N > model.buf.size) {
|
|
|
|
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
|
|
|
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
|
|
|
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
|
|
|
// printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.buf.size, buf_size_new);
|
|
|
|
|
|
|
|
|
|
|
|
// reallocate
|
|
|
|
// reallocate
|
|
|
|
buf_size = buf_size_new;
|
|
|
|
model.buf.resize(buf_size_new);
|
|
|
|
buf = realloc(buf, buf_size);
|
|
|
|
if (model.buf.addr == nullptr) {
|
|
|
|
if (buf == nullptr) {
|
|
|
|
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.buf.size);
|
|
|
|
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
|
|
|
|
|
|
|
return false;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
struct ggml_init_params params = {
|
|
|
|
struct ggml_init_params params = {
|
|
|
|
.mem_size = buf_size,
|
|
|
|
.mem_size = model.buf.size,
|
|
|
|
.mem_buffer = buf,
|
|
|
|
.mem_buffer = model.buf.addr,
|
|
|
|
.no_alloc = false,
|
|
|
|
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
struct ggml_context * ctx0 = ggml_init(params);
|
|
|
|
struct ggml_context * ctx0 = ggml_init(params);
|
|
|
|