@@ -583,20 +583,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_gpu_layers = std::stoi(argv[i]);
-#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
         } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_gpu_layers_draft = std::stoi(argv[i]);
-#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
         } else if (arg == "--main-gpu" || arg == "-mg") {
             if (++i >= argc) {
                 invalid_param = true;
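(Not part of the diff: a minimal standalone sketch of the pattern this hunk switches to, replacing the compile-time LLAMA_SUPPORTS_GPU_OFFLOAD guard with the runtime query llama_supports_gpu_offload() declared in llama.h. The file name and warning text below are illustrative only.)

    // gpu_offload_check.cpp -- illustrative sketch, not from the commit
    #include <cstdio>
    #include "llama.h"

    int main() {
        // The decision is now made at run time instead of being compiled out.
        if (!llama_supports_gpu_offload()) {
            fprintf(stderr, "warning: this build cannot offload layers to the GPU\n");
        }
        return 0;
    }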
@@ -637,11 +637,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             const std::regex regex{R"([,/]+)"};
             std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
             std::vector<std::string> split_arg{it, {}};
-            if (split_arg.size() >= LLAMA_MAX_DEVICES) {
+            if (split_arg.size() >= llama_max_devices()) {
                 invalid_param = true;
                 break;
             }
-            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+            for (size_t i = 0; i < llama_max_devices(); ++i) {
                 if (i < split_arg.size()) {
                     params.tensor_split[i] = std::stof(split_arg[i]);
                 } else {
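(Also not part of the diff: a hedged sketch of sizing the tensor-split proportions with the runtime value llama_max_devices() instead of the old LLAMA_MAX_DEVICES constant. The helper name and comma-only splitting below are illustrative and simpler than the regex used in common.cpp.)

    // tensor_split_sketch.cpp -- illustrative sketch, not from the commit
    #include <cstddef>
    #include <sstream>
    #include <string>
    #include <vector>
    #include "llama.h"

    // Parse "3,1"-style proportions into a vector sized by the runtime device limit.
    static std::vector<float> parse_tensor_split(const std::string & arg) {
        std::vector<float> split(llama_max_devices(), 0.0f); // runtime query, not a macro
        std::stringstream ss(arg);
        std::string item;
        for (size_t i = 0; i < split.size() && std::getline(ss, item, ','); ++i) {
            split[i] = std::stof(item);
        }
        return split;
    }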
@@ -989,30 +989,30 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
     printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
-    if (llama_mlock_supported()) {
+    if (llama_supports_mlock()) {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
-    if (llama_mmap_supported()) {
+    if (llama_supports_mmap()) {
         printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
     printf("  --numa                attempt optimizations that help on some NUMA systems\n");
     printf("                        if run without this previously, it is recommended to drop the system page cache before using this\n");
     printf("                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    printf("  -ngl N, --n-gpu-layers N\n");
-    printf("                        number of layers to store in VRAM\n");
-    printf("  -ngld N, --n-gpu-layers-draft N\n");
-    printf("                        number of layers to store in VRAM for the draft model\n");
-    printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-    printf("                        how to split the model across multiple GPUs, one of:\n");
-    printf("                          - none: use one GPU only\n");
-    printf("                          - layer (default): split layers and KV across GPUs\n");
-    printf("                          - row: split rows across GPUs\n");
-    printf("  -ts SPLIT, --tensor-split SPLIT\n");
-    printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-    printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
-    printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
-#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
+    if (llama_supports_gpu_offload()) {
+        printf("  -ngl N, --n-gpu-layers N\n");
+        printf("                        number of layers to store in VRAM\n");
+        printf("  -ngld N, --n-gpu-layers-draft N\n");
+        printf("                        number of layers to store in VRAM for the draft model\n");
+        printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+        printf("                        how to split the model across multiple GPUs, one of:\n");
+        printf("                          - none: use one GPU only\n");
+        printf("                          - layer (default): split layers and KV across GPUs\n");
+        printf("                          - row: split rows across GPUs\n");
+        printf("  -ts SPLIT, --tensor-split SPLIT\n");
+        printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+        printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
+        printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
+    }
     printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf("  -gan N, --grp-attn-n N\n");
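(Illustrative only, not from the commit: the renamed capability queries can be probed the same way at run time, for example to decide which options a front end should expose.)

    // capability_probe.cpp -- illustrative sketch, not from the commit
    #include <cstdio>
    #include "llama.h"

    int main() {
        printf("mmap:        %s\n",  llama_supports_mmap()        ? "yes" : "no");
        printf("mlock:       %s\n",  llama_supports_mlock()       ? "yes" : "no");
        printf("gpu offload: %s\n",  llama_supports_gpu_offload() ? "yes" : "no");
        printf("max devices: %zu\n", llama_max_devices());
        return 0;
    }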
@@ -1651,7 +1651,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
 
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
+    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);