From 596592ce12dc8d375b8c2e809cb98ba0ede29684 Mon Sep 17 00:00:00 2001
From: Adam Treat
Date: Sun, 9 Apr 2023 10:24:47 -0400
Subject: [PATCH] Time how long it takes to process the prompt.

---
 gptj.cpp | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/gptj.cpp b/gptj.cpp
index 214c1e36..7319b62e 100644
--- a/gptj.cpp
+++ b/gptj.cpp
@@ -424,7 +424,7 @@ bool gptj_eval(
 
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
         const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
 
         // reallocate
         buf_size = buf_size_new;
@@ -634,8 +634,6 @@ struct GPTJPrivate {
     bool modelLoaded;
     gpt_vocab vocab;
     gptj_model model;
-    int64_t t_main_start_us = 0;
-    int64_t t_load_us = 0;
     int64_t n_threads = 0;
     std::mt19937 rng;
 };
@@ -647,20 +645,13 @@ GPTJ::GPTJ()
 }
 
 bool GPTJ::loadModel(const std::string &modelPath, std::istream &fin) {
-    d_ptr->t_main_start_us = ggml_time_us();
     std::mt19937 rng(time(NULL));
     d_ptr->rng = rng;
 
     // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gptj_model_load(modelPath, fin, d_ptr->model, d_ptr->vocab)) {
-            std::cerr << "GPT-J ERROR: failed to load model from" << modelPath;
-            return false;
-        }
-
-        d_ptr->t_load_us = ggml_time_us() - t_start_us;
+    if (!gptj_model_load(modelPath, fin, d_ptr->model, d_ptr->vocab)) {
+        std::cerr << "GPT-J ERROR: failed to load model from" << modelPath;
+        return false;
     }
 
     d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@@ -687,10 +678,12 @@ void GPTJ::prompt(const std::string &prompt, std::function<bool(const std::string&)> response) {
     if (!isModelLoaded()) {
         std::cerr << "GPT-J ERROR: prompt won't work with an unloaded model!\n";
         return;
     }
 
+    const int64_t t_main_start_us = ggml_time_us();
     int64_t t_sample_us  = 0;
     int64_t t_predict_us = 0;
+    int64_t t_prompt_us  = 0;
 
     std::vector<float> logits;
 
@@ -724,6 +717,8 @@ void GPTJ::prompt(const std::string &prompt, std::function<bool(const std::string&)> response) {
         n_past += embd.size();
         embd.clear();
 
         if (i >= embd_inp.size()) {
+            t_prompt_us += ggml_time_us() - t_main_start_us;
+
             // sample next token
             const int n_vocab = d_ptr->model.hparams.n_vocab;
@@ -772,12 +767,11 @@ stop_generating:
     const int64_t t_main_end_us = ggml_time_us();
 
     std::cout << "GPT-J INFO: mem per token = " << mem_per_token << " bytes\n";
-    std::cout << "GPT-J INFO: load time = " << d_ptr->t_load_us/1000.0f << " ms\n";
     std::cout << "GPT-J INFO: sample time = " << t_sample_us/1000.0f << " ms\n";
+    std::cout << "GPT-J INFO: prompt time = " << t_prompt_us/1000.0f << " ms\n";
     std::cout << "GPT-J INFO: predict time = " << t_predict_us/1000.0f << " ms / " << t_predict_us/1000.0f/n_past << " ms per token\n";
-    std::cout << "GPT-J INFO: total time = " << (t_main_end_us - d_ptr->t_main_start_us)/1000.0f << " ms\n";
+    std::cout << "GPT-J INFO: total time = " << (t_main_end_us - t_main_start_us)/1000.0f << " ms\n";
     fflush(stdout);
-    fflush(stderr);
 }
 
 #endif
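
Reviewer note: the first hunk un-comments the log line that reports scratch-buffer
growth in gptj_eval. For reference, a hedged sketch of that grow-with-10%-headroom
pattern follows; ensure_buf and its signature are illustrative, not part of the source.

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>

    // Grow the scratch buffer when a batch of N tokens would not fit,
    // adding 10% headroom for ggml object overhead (as in gptj_eval).
    static bool ensure_buf(void **buf, size_t *buf_size,
                           size_t mem_per_token, size_t N) {
        if (mem_per_token > 0 && mem_per_token*N > *buf_size) {
            const size_t buf_size_new = 1.1*(mem_per_token*N);
            printf("reallocating buffer from %zu to %zu bytes\n",
                   *buf_size, buf_size_new);
            void *buf_new = realloc(*buf, buf_size_new);
            if (buf_new == nullptr)
                return false; // original buffer is still valid
            *buf = buf_new;
            *buf_size = buf_size_new;
        }
        return true;
    }

    int main() {
        size_t buf_size = 1024;
        void  *buf      = malloc(buf_size);
        ensure_buf(&buf, &buf_size, /*mem_per_token=*/512, /*N=*/8); // forces growth
        free(buf);
        return 0;
    }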
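
Reviewer note: a minimal, self-contained sketch of the timing scheme this patch
introduces. ggml_time_us() is stubbed with std::chrono and the eval/sample calls
are elided; only t_main_start_us, t_prompt_us, and the final report mirror the
patch. (In the patch itself the `t_prompt_us +=` line sits inside the generation
loop; the sketch samples it once, at the point generation begins.)

    #include <chrono>
    #include <cstdint>
    #include <cstdio>

    // stand-in for ggml_time_us(): monotonic clock in microseconds
    static int64_t time_us() {
        using namespace std::chrono;
        return duration_cast<microseconds>(
            steady_clock::now().time_since_epoch()).count();
    }

    int main() {
        const int64_t t_main_start_us = time_us();
        int64_t t_prompt_us = 0;

        // ... feed the prompt tokens through the model here ...

        // once generation starts (i >= embd_inp.size() in the patch),
        // everything elapsed so far is attributed to prompt processing
        t_prompt_us += time_us() - t_main_start_us;

        // ... sample and predict the remaining tokens here ...

        const int64_t t_main_end_us = time_us();
        printf("prompt time = %.2f ms\n", t_prompt_us/1000.0f);
        printf("total time  = %.2f ms\n", (t_main_end_us - t_main_start_us)/1000.0f);
        return 0;
    }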