@@ -13,7 +13,7 @@ use llama_cpp_2::context::params::LlamaContextParams;
 use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
-use llama_cpp_2::model::params::LlamaModelParams;
+use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};
 use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::sampling::LlamaSampler;
@@ -48,6 +48,23 @@ struct Args {
     #[cfg(any(feature = "cuda", feature = "vulkan"))]
     #[clap(long)]
     disable_gpu: bool,
+    /// Set main GPU device index (default: 0)
+    ///
+    /// Setting this option disables multi-GPU.
+    #[arg(
+        long,
+        help = "Set main GPU device index (default: 0). Disables multi-GPU."
+    )]
+    main_gpu: Option<i32>,
+    /// Set devices to use by index
+    ///
+    /// This option overrides `main-gpu` and enables multi-GPU.
+    #[arg(
+        long,
+        value_delimiter = ',',
+        help = "Set devices to use by index, separated by commas (e.g. --devices 0,1,2). Overrides main-gpu and enables multi-GPU."
+    )]
+    devices: Option<Vec<usize>>,
     #[cfg(any(feature = "cuda", feature = "vulkan"))]
     #[arg(long, help = "Keep MoE layers on CPU")]
     cmoe: bool,
@@ -72,6 +89,8 @@ struct Args {
     ctx_size: Option<NonZeroU32>,
     #[arg(short = 'v', long, help = "enable verbose llama.cpp logs")]
     verbose: bool,
+    #[arg(long, help = "list backend devices")]
+    list_devices: bool,
 }
 
 /// Parse a single key-value pair
@@ -132,6 +151,8 @@ fn main() -> Result<()> {
         file,
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         disable_gpu,
+        main_gpu,
+        devices,
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         cmoe,
         key_value_overrides,
@@ -140,6 +161,7 @@ fn main() -> Result<()> {
         threads_batch,
         ctx_size,
         verbose,
+        list_devices,
     } = Args::parse();
 
     if verbose {
@@ -151,8 +173,26 @@ fn main() -> Result<()> {
     // init LLM
     let backend = LlamaBackend::init()?;
 
+    if list_devices {
+        let devices = llama_cpp_2::list_llama_ggml_backend_devices();
+        for (i, dev) in devices.iter().enumerate() {
+            println!("Device {i:>2}: {}", dev.name);
+            println!("  Description: {}", dev.description);
+            println!("  Device Type: {:?}", dev.device_type);
+            println!("  Backend: {}", dev.backend);
+            println!(
+                "  Memory total: {:?} MiB",
+                dev.memory_total / 1024 / 1024
+            );
+            println!(
+                "  Memory free: {:?} MiB",
+                dev.memory_free / 1024 / 1024
+            );
+        }
+    }
+
     // offload all layers to the gpu
-    let model_params = {
+    let mut model_params = {
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         if !disable_gpu {
             LlamaModelParams::default().with_n_gpu_layers(1000)
@@ -163,6 +203,19 @@ fn main() -> Result<()> {
         LlamaModelParams::default()
     };
 
+    if let Some(devices) = devices {
+        model_params = model_params
+            .with_devices(&devices)
+            .with_context(|| "invalid device index in --devices")?;
+        if main_gpu.is_some() {
+            eprintln!("warning: --devices overrides --main-gpu");
+        }
+    } else if let Some(main_gpu) = main_gpu {
+        model_params = model_params.with_main_gpu(main_gpu);
+        // Enable single GPU mode
+        model_params = model_params.with_split_mode(LlamaSplitMode::None);
+    }
+
     let prompt = if let Some(str) = prompt {
         if file.is_some() {
             bail!("either prompt or file must be specified, but not both")
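
As a usage sketch (not part of the diff): the fields printed by --list-devices can also drive device selection programmatically. The helper below picks every backend device reporting at least 4 GiB of free memory and hands the indices to with_devices, mirroring the --devices code path above. It relies only on the llama_cpp_2 items visible in this diff; the function name and the 4096 MiB threshold are illustrative.

use anyhow::{Context, Result};
use llama_cpp_2::model::params::LlamaModelParams;

// Sketch: offload to every device with enough free memory. Uses the same
// list_llama_ggml_backend_devices() / with_devices() calls as the diff above;
// the 4096 MiB cutoff is arbitrary.
fn params_for_roomy_devices() -> Result<LlamaModelParams> {
    let picked: Vec<usize> = llama_cpp_2::list_llama_ggml_backend_devices()
        .iter()
        .enumerate()
        // memory_free is reported in bytes, as in the --list-devices output.
        .filter(|(_, dev)| dev.memory_free / 1024 / 1024 >= 4096)
        .map(|(i, _)| i)
        .collect();
    LlamaModelParams::default()
        .with_devices(&picked)
        .with_context(|| "failed to select devices by free memory")
}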