
Commit 196f508

common : more accurate sampling timing (#17382)
* common : more accurate sampling timing
* eval-callback : minor fixes
* cont : add time_meas impl
* cont : fix log msg [no ci]
* cont : fix multiple definitions of time_meas
* llama-cli : exclude chat template init from time measurement
* cont : print percentage of unaccounted time
* cont : do not reset timings
1 parent 5088b43 commit 196f508

7 files changed, +101 -32 lines changed

common/common.cpp

Lines changed: 8 additions & 1 deletion
@@ -26,7 +26,6 @@
 #include <sstream>
 #include <string>
 #include <thread>
-#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 
@@ -60,6 +59,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+common_time_meas::~common_time_meas() {
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
+    }
+}
+
 //
 // CPU utils
 //
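
The new helper mirrors the existing time_meas in src/llama-impl.cpp: the constructor records a start timestamp (or -1 when measurement is disabled) and the destructor adds the elapsed microseconds to a caller-owned accumulator. Below is a minimal standalone sketch of the same RAII pattern; it uses std::chrono and a hypothetical name in place of ggml_time_us() so it builds without ggml, and is not the actual implementation.

// sketch: same accumulate-on-destruction idea as common_time_meas,
// with std::chrono standing in for ggml_time_us() (assumption, not the real impl)
#include <chrono>
#include <cstdint>
#include <cstdio>

struct scoped_time_acc {
    scoped_time_acc(int64_t & acc, bool disable = false)
        : t_start_us(disable ? -1 : now_us()), t_acc(acc) {}

    ~scoped_time_acc() {
        if (t_start_us >= 0) {
            t_acc += now_us() - t_start_us; // elapsed time is added at scope exit
        }
    }

    static int64_t now_us() {
        using namespace std::chrono;
        return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
    }

    const int64_t t_start_us; // -1 marks a disabled measurement
    int64_t &     t_acc;      // accumulator owned by the caller
};

int main() {
    int64_t t_total_us = 0;
    {
        scoped_time_acc tm(t_total_us); // timing starts here
        // ... work to be measured ...
    }                                   // destructor adds the elapsed microseconds
    printf("accumulated: %lld us\n", (long long) t_total_us);
    return 0;
}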

common/common.h

Lines changed: 12 additions & 5 deletions
@@ -2,17 +2,15 @@
 
 #pragma once
 
+#include "ggml-opt.h"
+#include "llama-cpp.h"
+
 #include <set>
 #include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
-#include <sstream>
-#include <cmath>
-
-#include "ggml-opt.h"
-#include "llama-cpp.h"
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -30,6 +28,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct common_time_meas {
+    common_time_meas(int64_t & t_acc, bool disable = false);
+    ~common_time_meas();
+
+    const int64_t t_start_us;
+
+    int64_t & t_acc;
+};
+
 struct common_adapter_lora_info {
     std::string path;
    float scale;
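
With the struct declared here, attributing a scope's wall-clock time to a counter becomes a one-liner at the top of the scope. A short usage sketch; the accumulator and flag names below are illustrative placeholders, not names from this commit:

#include "common.h"

static int64_t t_my_path_us = 0; // hypothetical accumulator for this code path

void measured_section(bool no_perf) {
    const common_time_meas tm(t_my_path_us, no_perf);
    // ... code whose time should be attributed to t_my_path_us ...
}   // ~common_time_meas() adds the elapsed microseconds unless no_perf was set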

common/sampling.cpp

Lines changed: 60 additions & 6 deletions
@@ -3,9 +3,10 @@
 #include "common.h"
 #include "log.h"
 
+#include <algorithm>
 #include <cmath>
+#include <cstring>
 #include <unordered_map>
-#include <algorithm>
 
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -112,6 +113,13 @@ struct common_sampler {
 
     llama_token_data_array cur_p;
 
+    void reset() {
+        prev.clear();
+
+        llama_sampler_reset(grmr);
+        llama_sampler_reset(chain);
+    }
+
     void set_logits(struct llama_context * ctx, int idx) {
         const auto * logits = llama_get_logits_ith(ctx, idx);
 
@@ -128,6 +136,12 @@ struct common_sampler {
 
         cur_p = { cur.data(), cur.size(), -1, false };
     }
+
+    common_time_meas tm() {
+        return common_time_meas(t_total_us, params.no_perf);
+    }
+
+    mutable int64_t t_total_us = 0;
 };
 
 std::string common_params_sampling::print() const {
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 }
 
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+    const auto tm = gsmpl->tm();
+
     if (accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
     }
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
 }
 
 void common_sampler_reset(struct common_sampler * gsmpl) {
-    llama_sampler_reset(gsmpl->grmr);
-
-    llama_sampler_reset(gsmpl->chain);
+    gsmpl->reset();
 }
 
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
     // TODO: measure grammar performance
 
+    const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
+
+    llama_perf_sampler_data data_smpl;
+    llama_perf_context_data data_ctx;
+
+    memset(&data_smpl, 0, sizeof(data_smpl));
+    memset(&data_ctx,  0, sizeof(data_ctx));
+
     if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
+        auto & data = data_smpl;
+
+        data = llama_perf_sampler(gsmpl->chain);
+
+        // note: the sampling time includes the samplers time + extra time spent in common/sampling
+        LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
+        LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
     }
+
     if (ctx) {
-        llama_perf_context_print(ctx);
+        auto & data = data_ctx;
+
+        data = llama_perf_context(ctx);
+
+        const double t_end_ms = 1e-3 * ggml_time_us();
+
+        const double t_total_ms = t_end_ms - data.t_start_ms;
+        const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
+        const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
+
+        LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+        LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+        LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+        LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
+        LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);
+
         llama_memory_breakdown_print(ctx);
     }
 }
 
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    llama_synchronize(ctx);
+
+    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
+    const auto tm = gsmpl->tm();
+
     gsmpl->set_logits(ctx, idx);
 
     auto & grmr = gsmpl->grmr;
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 // helpers
 
 llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    const auto tm = gsmpl->tm();
+
     auto * res = &gsmpl->cur_p;
 
     if (do_sort && !res->sorted) {
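
Two details above are easy to miss. First, common_sampler_sample now calls llama_synchronize(ctx) before starting its timer, so pending asynchronous backend work is not billed to sampling. Second, the new "unaccounted time" line is derived rather than measured: it is whatever part of the wall-clock total is not attributed to sampling, prompt eval, or eval. A small sketch of that arithmetic with made-up numbers:

#include <cstdio>

int main() {
    // illustrative values only, not real measurements
    const double t_total_ms    = 1250.0; // wall clock since the last perf reset
    const double t_sampling_ms =   40.0; // time spent in common/sampling
    const double t_p_eval_ms   =  300.0; // prompt processing
    const double t_eval_ms     =  850.0; // token generation

    const double t_unacc_ms = t_total_ms - (t_sampling_ms + t_p_eval_ms + t_eval_ms);
    const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;

    printf("unaccounted time = %.2f ms / %.1f %%\n", t_unacc_ms, t_unacc_pc); // 60.00 ms / 4.8 %
    return 0;
}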

examples/eval-callback/eval-callback.cpp

Lines changed: 9 additions & 9 deletions
@@ -4,10 +4,10 @@
 #include "llama.h"
 #include "ggml.h"
 
+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
-#include <numeric>
 
 /**
  * This the arbitrary data which will be passed to each callback.
@@ -37,23 +37,23 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
     return u.f;
 }
 
-static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
     float v;
     if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
     } else if (type == GGML_TYPE_F32) {
-        v = *(float *) &data[i];
+        v = *(const float *) &data[i];
     } else if (type == GGML_TYPE_I64) {
-        v = (float) *(int64_t *) &data[i];
+        v = (float) *(const int64_t *) &data[i];
     } else if (type == GGML_TYPE_I32) {
-        v = (float) *(int32_t *) &data[i];
+        v = (float) *(const int32_t *) &data[i];
     } else if (type == GGML_TYPE_I16) {
-        v = (float) *(int16_t *) &data[i];
+        v = (float) *(const int16_t *) &data[i];
     } else if (type == GGML_TYPE_I8) {
-        v = (float) *(int8_t *) &data[i];
+        v = (float) *(const int8_t *) &data[i];
     } else if (type == GGML_TYPE_BF16) {
-        v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
+        v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
     } else {
         GGML_ABORT("fatal error");
     }
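
The change above only adds const; the access pattern itself is unchanged: nb[] holds byte strides per dimension, so the element at (i0, i1, i2, i3) starts at byte offset i3*nb[3] + i2*nb[2] + i1*nb[1] + i0*nb[0]. A small self-contained sketch of the same offset computation for a contiguous float tensor (sizes and values are illustrative):

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    // a contiguous 3x4 float "tensor": 4 elements per row, 3 rows
    float buf[3][4] = {};
    buf[2][1] = 7.0f;

    const uint8_t * data  = (const uint8_t *) buf;
    const size_t    nb[4] = { sizeof(float), 4*sizeof(float), 12*sizeof(float), 12*sizeof(float) };

    // same byte-offset formula as ggml_get_float_value()
    const size_t i0 = 1, i1 = 2, i2 = 0, i3 = 0;
    const size_t i  = i3*nb[3] + i2*nb[2] + i1*nb[1] + i0*nb[0];

    printf("value = %.1f\n", *(const float *) &data[i]); // prints 7.0
    return 0;
}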

src/llama-impl.cpp

Lines changed: 3 additions & 3 deletions
@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
 time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
 
 time_meas::~time_meas() {
-        if (t_start_us >= 0) {
-            t_acc += ggml_time_us() - t_start_us;
-        }
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
     }
+}
 
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);

src/llama-sampling.cpp

Lines changed: 3 additions & 6 deletions
@@ -472,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
     for (auto * smpl : chain->samplers) {
         llama_sampler_reset(smpl);
     }
-
-    chain->t_sample_us = 0;
-    chain->n_sample = 0;
 }
 
 static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
@@ -2670,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c
 void llama_perf_sampler_print(const struct llama_sampler * chain) {
     const auto data = llama_perf_sampler(chain);
 
-    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+    LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
 }
 
 void llama_perf_sampler_reset(struct llama_sampler * chain) {
@@ -2681,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {
 
     auto * ctx = (struct llama_sampler_chain *) chain->ctx;
 
-    ctx->t_sample_us = ctx->n_sample = 0;
+    ctx->t_sample_us = 0;
+    ctx->n_sample = 0;
 }
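
The effect of dropping the counter reset from llama_sampler_chain_reset is that resetting a sampler between generations no longer discards its accumulated timing; zeroing now happens only through llama_perf_sampler_reset. A sketch of the resulting usage, assuming llama.h is available (the greedy chain here is a trivial stand-in for a real sampler configuration):

#include "llama.h"

int main() {
    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // ... first generation: sampling accumulates t_sample_us / n_sample ...

    llama_sampler_reset(smpl);      // clears sampler state, keeps the timing counters

    // ... second generation ...

    llama_perf_sampler_print(smpl); // cumulative "samplers time" across both generations
    llama_perf_sampler_reset(smpl); // only this explicitly zeroes the counters

    llama_sampler_free(smpl);
    return 0;
}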

tools/main/main.cpp

Lines changed: 6 additions & 2 deletions
@@ -147,11 +147,15 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    auto * mem = llama_get_memory(ctx);
-
+    llama_memory_t mem = llama_get_memory(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // note: the time for chat template initialization is not negligible:
     auto chat_templates = common_chat_templates_init(model, params.chat_template);
 
+    // start measuring performance timings from here
+    llama_perf_context_reset(ctx);
+
     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
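
The llama-cli change follows a simple pattern: perform the expensive setup first, then reset the context's performance counters so the reported totals start at the generation phase. A sketch of that pattern, assuming ctx is an already-created llama_context and expensive_setup() is a hypothetical stand-in for work such as chat template initialization:

#include "llama.h"

void expensive_setup(); // hypothetical: chat template init, warm-up, etc.

void run_with_clean_timings(llama_context * ctx) {
    expensive_setup();

    // timings reported later start from this point
    llama_perf_context_reset(ctx);

    // ... prompt processing and generation loop ...

    llama_perf_context_print(ctx); // totals now exclude the setup above
}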
