@@ -2482,6 +2482,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;
 
     enum llama_pooling_type pooling_type;
 
@@ -6647,8 +6648,6 @@ static bool llm_load_tensors(
         bool use_mlock,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
     auto & hparams = model.hparams;
 
     model.split_mode = split_mode;
@@ -8579,14 +8578,13 @@ static bool llm_load_tensors(
         }
     }
 
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
     return true;
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
@@ -8648,6 +8646,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         return -1;
     }
 
+    // loading time will be recalculated after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+
     return 0;
 }
 
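Note: the two hunks above relocate the load timer from llm_load_tensors into llama_model_load, so t_start_us is now set before the llama_model_loader is even constructed and t_load_us brackets the entire load (metadata, vocab, and tensors) rather than tensor setup alone. A condensed sketch of the resulting flow, with error handling and the actual loading steps elided:

    // sketch only -- not the full function body
    static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
        model.t_start_us = ggml_time_us();                    // start before any file I/O
        // ... construct llama_model_loader, load vocab and tensors ...
        model.t_load_us = ggml_time_us() - model.t_start_us;  // refined after the first eval (mmap page faults)
        return 0;
    }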
@@ -17915,6 +17917,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
         /*.flash_attn          =*/ false,
+        /*.no_perf             =*/ true,
         /*.abort_callback      =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
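Because no_perf defaults to true, timing collection is now off unless the caller opts in. A minimal usage sketch, assuming `model` is a handle previously obtained from llama_load_model_from_file:

    llama_context_params cparams = llama_context_default_params();
    cparams.no_perf = false; // opt back in to eval / prompt-eval timing accumulation
    llama_context * lctx = llama_new_context_with_model(model, cparams);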
@@ -18125,6 +18128,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings   = params.embeddings;
     cparams.offload_kqv  = params.offload_kqv;
     cparams.flash_attn   = params.flash_attn;
+    cparams.no_perf      = params.no_perf;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -20043,10 +20047,14 @@ void llama_synchronize(struct llama_context * ctx) {
 
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
-        ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
-        ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
        ctx->n_p_eval += ctx->n_queued_tokens;
     }
 
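Only the ggml_time_us() bookkeeping is guarded here; the n_eval and n_p_eval counters still advance unconditionally, so token counts stay meaningful even when timings are disabled. A sketch using the llama_perf_get introduced in the next hunk (`lctx` is a hypothetical context handle):

    // with no_perf == true the t_*_ms fields stay at 0, but the counts advance
    const llama_perf_data pd = llama_perf_get(lctx, LLAMA_PERF_TYPE_CONTEXT);
    const int32_t n_tokens = pd.n_p_eval + pd.n_eval; // note: each is clamped to >= 1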
@@ -20653,39 +20661,61 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) {
+    llama_perf_data data = {};
+
+    if (ctx == nullptr) {
+        return data;
+    }
+
     switch (type) {
         case LLAMA_PERF_TYPE_CONTEXT:
             {
                 const auto * p = (const struct llama_context *) ctx;
 
-                const double t_start_ms  = 1e-3 * p->t_start_us;
-                const double t_end_ms    = 1.00 * ggml_time_ms();
-                const double t_load_ms   = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms   = 1e-3 * p->t_eval_us;
+                data.t_start_ms  = 1e-3 * p->t_start_us;
+                data.t_load_ms   = 1e-3 * p->t_load_us;
+                data.t_p_eval_ms = 1e-3 * p->t_p_eval_us;
+                data.t_eval_ms   = 1e-3 * p->t_eval_us;
+                data.n_p_eval    = std::max(1, p->n_p_eval);
+                data.n_eval      = std::max(1, p->n_eval);
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                const auto * smpl = (const struct llama_sampler *) ctx;
+                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
 
-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval   = std::max(1, p->n_eval);
+                data.t_sample_ms = 1e-3 * p->t_sample_us;
+                data.n_sample    = std::max(0, p->n_sample);
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
+
+    return data;
+}
 
-                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
+void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                const auto data = llama_perf_get(ctx, type);
+
+                const double t_end_ms = 1e-3 * ggml_time_us();
+
+                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
                 LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
+                        __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
                 LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
+                        __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
             } break;
         case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
             {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
-
-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
-
-                const int32_t n_sampler = std::max(0, p->n_sample);
+                const auto data = llama_perf_get(ctx, type);
 
                 LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+                        __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
             } break;
         default:
             GGML_ABORT("invalid perf type");
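The new llama_perf_get lets callers consume the counters programmatically instead of going through the logger; llama_perf_print is now a thin formatter over it. A usage sketch, again assuming a hypothetical context handle `lctx`:

    // fetch timings for a context and report decode throughput ourselves
    const llama_perf_data pd = llama_perf_get(lctx, LLAMA_PERF_TYPE_CONTEXT);
    fprintf(stderr, "eval: %.2f ms over %d runs (%.2f tokens/s)\n",
            pd.t_eval_ms, pd.n_eval, 1e3 / pd.t_eval_ms * pd.n_eval);

Since n_eval and n_p_eval are clamped to at least 1 inside llama_perf_get, the divisions above cannot divide by zero, matching the behavior of the print path.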
@@ -20705,7 +20735,7 @@ void llama_perf_reset(void * ctx, enum llama_perf_type type) {
         case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
             {
                 auto * smpl = (struct llama_sampler *) ctx;
-                auto * p = (struct llama_sampler_chain *) smpl->ctx;
+                auto * p    = (struct llama_sampler_chain *) smpl->ctx;
 
                 p->t_sample_us = p->n_sample = 0;
             } break;
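For completeness, resetting between runs uses the same type discriminator; in this sketch `lctx` and `smpl` are hypothetical handles for a context and a sampler chain:

    llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT);
    llama_perf_reset(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);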