3 files changed, 8 additions and 12 deletions
@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads         = params.cpuparams.n_threads;
     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                 params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all        = false;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base    = params.rope_freq_base;
@@ -351,19 +351,17 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        // TODO: move at the end of the struct
-        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
-
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
         // currently works only with CPU execution
         ggml_abort_callback abort_callback;
         void *              abort_callback_data;
+
+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        bool embeddings;  // if true, extract embeddings (together with logits)
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;     // whether to measure performance timings
     };

     // model quantization parameters
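Note on migrating callers: with logits_all gone from llama_context_params, outputs are requested per token through llama_batch.logits, as the deprecated comment above already suggested. The following is a minimal sketch of that pattern, assuming the usual model/context setup; ctx, prompt_tokens, n_prompt, and the helper name decode_prompt are illustrative placeholders, not part of this diff.

    #include "llama.h"

    // Sketch: mark which tokens should produce logits instead of relying on
    // the removed context-level logits_all flag. The context and tokenized
    // prompt are assumed to come from the usual llama.cpp setup code.
    static void decode_prompt(struct llama_context * ctx, const llama_token * prompt_tokens, int32_t n_prompt) {
        struct llama_batch batch = llama_batch_init(n_prompt, /*embd*/ 0, /*n_seq_max*/ 1);

        batch.n_tokens = n_prompt;
        for (int32_t i = 0; i < n_prompt; ++i) {
            batch.token[i]     = prompt_tokens[i];
            batch.pos[i]       = i;
            batch.n_seq_id[i]  = 1;
            batch.seq_id[i][0] = 0;
            batch.logits[i]    = 0;        // no output for this token
        }
        batch.logits[n_prompt - 1] = 1;    // only the last token produces logits

        if (llama_decode(ctx, batch) == 0) {
            // valid for any batch index whose logits flag was set (or -1 for the last output)
            const float * logits = llama_get_logits_ith(ctx, n_prompt - 1);
            (void) logits;
        }

        llama_batch_free(batch);
    }

Setting every flag to 1 reproduces what logits_all used to force globally; setting only the positions you need avoids computing logits for every token.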
@@ -1851,13 +1851,12 @@ llama_context_params llama_context_default_params() {
         /*.cb_eval_user_data           =*/ nullptr,
         /*.type_k                      =*/ GGML_TYPE_F16,
         /*.type_v                      =*/ GGML_TYPE_F16,
-        /*.logits_all                  =*/ false,
+        /*.abort_callback              =*/ nullptr,
+        /*.abort_callback_data         =*/ nullptr,
         /*.embeddings                  =*/ false,
         /*.offload_kqv                 =*/ true,
         /*.flash_attn                  =*/ false,
         /*.no_perf                     =*/ true,
-        /*.abort_callback              =*/ nullptr,
-        /*.abort_callback_data         =*/ nullptr,
     };

     return result;