@@ -1201,6 +1201,8 @@ struct server_task_result_metrics : server_task_result {
     uint64_t n_tokens_predicted_total = 0;
     uint64_t t_tokens_generation_total = 0;

+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing = 0;

@@ -1226,6 +1228,8 @@ struct server_task_result_metrics : server_task_result {
             { "n_tokens_predicted_total",  n_tokens_predicted_total },
             { "t_prompt_processing_total", t_prompt_processing_total },

+            { "n_past_max", n_past_max },
+
             { "n_prompt_tokens_processed", n_prompt_tokens_processed },
             { "t_prompt_processing",       t_prompt_processing },
             { "n_tokens_predicted",        n_tokens_predicted },
@@ -1587,6 +1591,8 @@ struct server_metrics {
     uint64_t n_tokens_predicted_total = 0;
     uint64_t t_tokens_generation_total = 0;

+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing = 0;

@@ -1605,6 +1611,10 @@ struct server_metrics {
         n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
         t_prompt_processing       += slot.t_prompt_processing;
         t_prompt_processing_total += slot.t_prompt_processing;
+
+        if (slot.n_past > 0) {
+            n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+        }
     }

     void on_prediction(const server_slot & slot) {
@@ -1620,6 +1630,9 @@ struct server_metrics {
             if (slot.is_processing()) {
                 n_busy_slots_total++;
             }
+            if (slot.n_past > 0) {
+                n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+            }
         }
     }

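The two update sites above fold each active slot's `n_past` into a single high-water-mark gauge. Below is a minimal, self-contained sketch of that accumulation pattern; `fake_slot` and `metrics_sketch` are hypothetical stand-ins for the real `server_slot` and `server_metrics` types defined elsewhere in `server.cpp`, kept only to make the snippet compile on its own.

```cpp
// Sketch of the n_past_max accumulation pattern, under simplified assumptions.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct fake_slot {           // hypothetical stand-in for server_slot
    int32_t n_past = 0;      // KV cache position is a signed 32-bit value
    bool    processing = false;
    bool    is_processing() const { return processing; }
};

struct metrics_sketch {      // hypothetical stand-in for server_metrics
    uint64_t n_busy_slots_total = 0;
    uint64_t n_past_max         = 0;

    // mirrors the on_decoded path: scan all slots after a llama_decode() call
    void on_decoded(const std::vector<fake_slot> & slots) {
        for (const auto & slot : slots) {
            if (slot.is_processing()) {
                n_busy_slots_total++;
            }
            // the n_past > 0 guard skips idle slots; the cast widens the
            // signed n_past to the unsigned gauge type before taking the max
            if (slot.n_past > 0) {
                n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
            }
        }
    }
};

int main() {
    metrics_sketch m;
    m.on_decoded({ {128, true}, {4096, true}, {0, false} });
    m.on_decoded({ {512, true} });
    std::printf("n_past_max = %llu\n", (unsigned long long) m.n_past_max); // prints 4096
    return 0;
}
```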
@@ -2875,6 +2888,8 @@ struct server_context {
             res->n_tokens_predicted_total  = metrics.n_tokens_predicted_total;
             res->t_tokens_generation_total = metrics.t_tokens_generation_total;

+            res->n_past_max = metrics.n_past_max;
+
             res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed;
             res->t_prompt_processing       = metrics.t_prompt_processing;
             res->n_tokens_predicted        = metrics.n_tokens_predicted;
@@ -4077,6 +4092,10 @@ int main(int argc, char ** argv) {
                     {"name",  "n_decode_total"},
                     {"help",  "Total number of llama_decode() calls"},
                     {"value", res_metrics->n_decode_total}
+            }, {
+                    {"name",  "n_past_max"},
+                    {"help",  "Largest observed n_past."},
+                    {"value", res_metrics->n_past_max}
             }, {
                     {"name",  "n_busy_slots_per_decode"},
                     {"help",  "Average number of busy slots per llama_decode() call"},
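The new entry above keeps the same `{name, help, value}` shape as the neighbouring metric definitions, which the `/metrics` handler renders into Prometheus-style exposition text. The following is a rough, hypothetical sketch of that kind of rendering, not the actual handler in `server.cpp` (details such as the `llamacpp` prefix and the counter/gauge split are assumptions here).

```cpp
// Hypothetical sketch: turning {name, help, value} entries into
// Prometheus-style exposition lines.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct metric_def {          // assumed simplified shape of one entry
    std::string name;
    std::string help;
    uint64_t    value;
};

static std::string render_gauges(const std::vector<metric_def> & defs) {
    std::string out;
    for (const auto & d : defs) {
        // "llamacpp" is an assumed metric prefix, used for illustration only
        out += "# HELP llamacpp:" + d.name + " " + d.help + "\n";
        out += "# TYPE llamacpp:" + d.name + " gauge\n";
        out += "llamacpp:" + d.name + " " + std::to_string(d.value) + "\n";
    }
    return out;
}

int main() {
    const std::vector<metric_def> gauges = {
        { "n_past_max", "Largest observed n_past.", 4096 },
    };
    std::fputs(render_gauges(gauges).c_str(), stdout);
    return 0;
}
```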