@@ -13,7 +13,7 @@ use llama_cpp_2::context::params::LlamaContextParams;
 use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
-use llama_cpp_2::model::params::LlamaModelParams;
+use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};
 use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::sampling::LlamaSampler;
@@ -48,6 +48,23 @@ struct Args {
     #[cfg(any(feature = "cuda", feature = "vulkan"))]
     #[clap(long)]
     disable_gpu: bool,
+    /// Set main GPU device index (default: 0)
+    ///
+    /// Setting this option disables multi-GPU.
+    #[arg(
+        long,
+        help = "Set main GPU device index (default: 0). Disables multi-GPU."
+    )]
+    main_gpu: Option<i32>,
+    /// Set devices to use by index
+    ///
+    /// This option overrides `main-gpu` and enables multi-GPU.
+    #[arg(
+        long,
+        value_delimiter = ',',
+        help = "Set devices to use by index, separated by commas (e.g. --devices 0,1,2). Overrides main-gpu and enables multi-GPU."
+    )]
+    devices: Option<Vec<usize>>,
     #[cfg(any(feature = "cuda", feature = "vulkan"))]
     #[arg(long, help = "Keep MoE layers on CPU")]
     cmoe: bool,
@@ -72,6 +89,8 @@ struct Args {
     ctx_size: Option<NonZeroU32>,
     #[arg(short = 'v', long, help = "enable verbose llama.cpp logs")]
     verbose: bool,
+    #[arg(long, help = "list backend devices")]
+    list_devices: bool,
 }
 
 /// Parse a single key-value pair
@@ -132,6 +151,8 @@ fn main() -> Result<()> {
         file,
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         disable_gpu,
+        main_gpu,
+        devices,
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         cmoe,
         key_value_overrides,
@@ -140,6 +161,7 @@ fn main() -> Result<()> {
         threads_batch,
         ctx_size,
         verbose,
+        list_devices,
     } = Args::parse();
 
     if verbose {
@@ -151,8 +173,26 @@ fn main() -> Result<()> {
     // init LLM
     let backend = LlamaBackend::init()?;
 
+    if list_devices {
+        let devices = llama_cpp_2::list_llama_ggml_backend_devices();
+        for (i, dev) in devices.iter().enumerate() {
+            println!("Device {i:>2}: {}", dev.name);
+            println!("  Description: {}", dev.description);
+            println!("  Device Type: {:?}", dev.device_type);
+            println!("  Backend: {}", dev.backend);
+            println!(
+                "  Memory total: {:?} MiB",
+                dev.memory_total / 1024 / 1024
+            );
+            println!(
+                "  Memory free: {:?} MiB",
+                dev.memory_free / 1024 / 1024
+            );
+        }
+    }
+
     // offload all layers to the gpu
-    let model_params = {
+    let mut model_params = {
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         if !disable_gpu {
             LlamaModelParams::default().with_n_gpu_layers(1000)
@@ -163,6 +203,19 @@ fn main() -> Result<()> {
         LlamaModelParams::default()
     };
 
+    if let Some(devices) = devices {
+        model_params = model_params
+            .with_devices(&devices)
+            .with_context(|| "invalid device index in --devices")?;
+        if main_gpu.is_some() {
+            eprintln!("warning: --devices overrides --main-gpu");
+        }
+    } else if let Some(main_gpu) = main_gpu {
+        model_params = model_params.with_main_gpu(main_gpu);
+        // Enable single GPU mode
+        model_params = model_params.with_split_mode(LlamaSplitMode::None);
+    }
+
     let prompt = if let Some(str) = prompt {
         if file.is_some() {
             bail!("either prompt or file must be specified, but not both")
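
As a usage sketch (not part of the diff): the fields printed by --list-devices can also drive device selection programmatically. The helper below picks every backend device reporting at least 4 GiB of free memory and hands the indices to with_devices, mirroring the --devices code path above. It relies only on the llama_cpp_2 items visible in this diff; the function name and the 4096 MiB threshold are illustrative.

use anyhow::{Context, Result};
use llama_cpp_2::model::params::LlamaModelParams;

// Sketch: offload to every device with enough free memory. Uses the same
// list_llama_ggml_backend_devices() / with_devices() calls as the diff above;
// the 4096 MiB cutoff is arbitrary.
fn params_for_roomy_devices() -> Result<LlamaModelParams> {
    let picked: Vec<usize> = llama_cpp_2::list_llama_ggml_backend_devices()
        .iter()
        .enumerate()
        // memory_free is reported in bytes, as in the --list-devices output.
        .filter(|(_, dev)| dev.memory_free / 1024 / 1024 >= 4096)
        .map(|(i, _)| i)
        .collect();
    LlamaModelParams::default()
        .with_devices(&picked)
        .with_context(|| "failed to select devices by free memory")
}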