
Commit 9aa9771

Merge pull request #861 from kusaanko/selectable-backend-devices
Add ability to select backend devices
2 parents 266957a + 86b1996 commit 9aa9771


3 files changed: +332 −2 lines changed


examples/simple/src/main.rs

Lines changed: 55 additions & 2 deletions
@@ -13,7 +13,7 @@ use llama_cpp_2::context::params::LlamaContextParams;
 use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
 use llama_cpp_2::model::params::kv_overrides::ParamOverrideValue;
-use llama_cpp_2::model::params::LlamaModelParams;
+use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};
 use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::sampling::LlamaSampler;
@@ -48,6 +48,23 @@ struct Args {
     #[cfg(any(feature = "cuda", feature = "vulkan"))]
     #[clap(long)]
     disable_gpu: bool,
+    /// Set main GPU device index (default: 0)
+    ///
+    /// Setting this option disables multi-GPU.
+    #[arg(
+        long,
+        help = "Set main GPU device id (default: 0). Disables multi-GPU."
+    )]
+    main_gpu: Option<i32>,
+    /// Set devices to use by index
+    ///
+    /// This option overrides `main-gpu` and enables multi-GPU.
+    #[arg(
+        long,
+        value_delimiter = ',',
+        help = "Set devices to use by index, separated by commas (e.g. --devices 0,1,2). Overrides main-gpu and enables multi-GPU."
+    )]
+    devices: Option<Vec<usize>>,
     #[cfg(any(feature = "cuda", feature = "vulkan"))]
     #[arg(long, help = "Keep MoE layers on CPU")]
     cmoe: bool,
@@ -72,6 +89,8 @@ struct Args {
     ctx_size: Option<NonZeroU32>,
     #[arg(short = 'v', long, help = "enable verbose llama.cpp logs")]
     verbose: bool,
+    #[arg(long, help = "list backend devices")]
+    list_devices: bool,
 }
 
 /// Parse a single key-value pair
@@ -132,6 +151,8 @@ fn main() -> Result<()> {
         file,
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         disable_gpu,
+        main_gpu,
+        devices,
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         cmoe,
         key_value_overrides,
@@ -140,6 +161,7 @@ fn main() -> Result<()> {
         threads_batch,
         ctx_size,
         verbose,
+        list_devices,
     } = Args::parse();
 
     if verbose {
@@ -151,8 +173,26 @@ fn main() -> Result<()> {
     // init LLM
     let backend = LlamaBackend::init()?;
 
+    if list_devices {
+        let devices = llama_cpp_2::list_llama_ggml_backend_devices();
+        for (i, dev) in devices.iter().enumerate() {
+            println!("Device {i:>2}: {}", dev.name);
+            println!("  Description: {}", dev.description);
+            println!("  Device Type: {:?}", dev.device_type);
+            println!("  Backend: {}", dev.backend);
+            println!(
+                "  Memory total: {:?} MiB",
+                dev.memory_total / 1024 / 1024
+            );
+            println!(
+                "  Memory free: {:?} MiB",
+                dev.memory_free / 1024 / 1024
+            );
+        }
+    }
+
     // offload all layers to the gpu
-    let model_params = {
+    let mut model_params = {
         #[cfg(any(feature = "cuda", feature = "vulkan"))]
         if !disable_gpu {
             LlamaModelParams::default().with_n_gpu_layers(1000)
@@ -163,6 +203,19 @@ fn main() -> Result<()> {
         LlamaModelParams::default()
     };
 
+    if let Some(devices) = devices {
+        model_params = model_params
+            .with_devices(&devices)
+            .with_context(|| "invalid device index in --devices")?;
+        if main_gpu.is_some() {
+            eprintln!("warning: --devices overrides --main-gpu");
+        }
+    } else if let Some(main_gpu) = main_gpu {
+        model_params = model_params.with_main_gpu(main_gpu);
+        // Enable single GPU mode
+        model_params = model_params.with_split_mode(LlamaSplitMode::None);
+    }
+
     let prompt = if let Some(str) = prompt {
         if file.is_some() {
             bail!("either prompt or file must be specified, but not both")
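Taken together, the new flags give the example three entry points: `--list-devices` prints the enumerated backend devices, `--devices 0,1,2` selects an explicit device set while keeping multi-GPU splitting, and `--main-gpu N` alone pins the model to a single device by also switching the split mode to `LlamaSplitMode::None`. The sketch below isolates that precedence logic; it relies only on the `llama-cpp-2` calls visible in this diff, and the helper name `configure_devices` is illustrative, not part of the crate.

```rust
// Sketch only: mirrors the --devices / --main-gpu precedence used in the example above.
// Assumes the llama-cpp-2 API exactly as shown in this diff (`with_devices`,
// `with_main_gpu`, `with_split_mode`); `configure_devices` is a hypothetical helper.
use anyhow::{Context, Result};
use llama_cpp_2::model::params::{LlamaModelParams, LlamaSplitMode};

fn configure_devices(
    mut model_params: LlamaModelParams,
    devices: Option<Vec<usize>>,
    main_gpu: Option<i32>,
) -> Result<LlamaModelParams> {
    if let Some(devices) = devices {
        // Explicit device list wins; multi-GPU splitting stays enabled.
        model_params = model_params
            .with_devices(&devices)
            .with_context(|| "invalid device index in --devices")?;
    } else if let Some(main_gpu) = main_gpu {
        // A lone --main-gpu pins the model to one device and disables splitting.
        model_params = model_params.with_main_gpu(main_gpu);
        model_params = model_params.with_split_mode(LlamaSplitMode::None);
    }
    Ok(model_params)
}
```

Keeping `--devices` authoritative over `--main-gpu` matches the warning path in the example, so callers get a deterministic outcome when both flags are passed.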

llama-cpp-2/src/lib.rs

Lines changed: 91 additions & 0 deletions
@@ -66,6 +66,12 @@ pub enum LLamaCppError {
     #[error(transparent)]
     EmbeddingError(#[from] EmbeddingsError),
     // See [`LlamaSamplerError`]
+    /// Backend device not found
+    #[error("Backend device {0} not found")]
+    BackendDeviceNotFound(usize),
+    /// Max devices exceeded
+    #[error("Max devices exceeded. Max devices is {0}")]
+    MaxDevicesExceeded(usize),
 }
 
 /// There was an error while getting the chat template from a model.
@@ -349,6 +355,91 @@ pub fn llama_supports_mlock() -> bool {
     unsafe { llama_cpp_sys_2::llama_supports_mlock() }
 }
 
+/// Backend device type
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum LlamaBackendDeviceType {
+    /// CPU device
+    Cpu,
+    /// ACCEL device
+    Accelerator,
+    /// GPU device
+    Gpu,
+    /// iGPU device
+    IntegratedGpu,
+    /// Unknown device type
+    Unknown,
+}
+
+/// A ggml backend device
+///
+/// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
+#[derive(Debug, Clone)]
+pub struct LlamaBackendDevice {
+    /// The index of the device
+    ///
+    /// The index can be used with `LlamaModelParams::with_devices` to select specific devices.
+    pub index: usize,
+    /// The name of the device (e.g. "Vulkan0")
+    pub name: String,
+    /// A description of the device (e.g. "NVIDIA GeForce RTX 3080")
+    pub description: String,
+    /// The backend of the device (e.g. "Vulkan", "CUDA", "CPU")
+    pub backend: String,
+    /// Total memory of the device in bytes
+    pub memory_total: usize,
+    /// Free memory of the device in bytes
+    pub memory_free: usize,
+    /// Device type
+    pub device_type: LlamaBackendDeviceType,
+}
+
+/// List ggml backend devices
+#[must_use]
+pub fn list_llama_ggml_backend_devices() -> Vec<LlamaBackendDevice> {
+    let mut devices = Vec::new();
+    for i in 0..unsafe { llama_cpp_sys_2::ggml_backend_dev_count() } {
+        fn cstr_to_string(ptr: *const i8) -> String {
+            if ptr.is_null() {
+                String::new()
+            } else {
+                unsafe { std::ffi::CStr::from_ptr(ptr) }
+                    .to_string_lossy()
+                    .to_string()
+            }
+        }
+        let dev = unsafe { llama_cpp_sys_2::ggml_backend_dev_get(i) };
+        let props = unsafe {
+            let mut props = std::mem::zeroed();
+            llama_cpp_sys_2::ggml_backend_dev_get_props(dev, &raw mut props);
+            props
+        };
+        let name = cstr_to_string(props.name);
+        let description = cstr_to_string(props.description);
+        let backend = unsafe { llama_cpp_sys_2::ggml_backend_dev_backend_reg(dev) };
+        let backend_name = unsafe { llama_cpp_sys_2::ggml_backend_reg_name(backend) };
+        let backend = cstr_to_string(backend_name);
+        let memory_total = props.memory_total;
+        let memory_free = props.memory_free;
+        let device_type = match props.type_ {
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_CPU => LlamaBackendDeviceType::Cpu,
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_ACCEL => LlamaBackendDeviceType::Accelerator,
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_GPU => LlamaBackendDeviceType::Gpu,
+            llama_cpp_sys_2::GGML_BACKEND_DEVICE_TYPE_IGPU => LlamaBackendDeviceType::IntegratedGpu,
+            _ => LlamaBackendDeviceType::Unknown,
+        };
+        devices.push(LlamaBackendDevice {
+            index: i,
+            name,
+            description,
+            backend,
+            memory_total,
+            memory_free,
+            device_type,
+        });
+    }
+    devices
+}
+
 /// Options to configure how llama.cpp logs are intercepted.
 #[derive(Default, Debug, Clone)]
 pub struct LogOptions {
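For callers that want to consume `list_llama_ggml_backend_devices` programmatically rather than just printing it, the fields added above are enough to filter for GPUs and feed the resulting indices back into `LlamaModelParams::with_devices`, as the struct docs suggest. A minimal sketch, assuming only the types introduced in this diff (the helper `gpu_device_indices` is illustrative, not part of the crate):

```rust
// Sketch: collect the indices of GPU-class backend devices using the new
// LlamaBackendDevice / LlamaBackendDeviceType API added in this diff.
// `gpu_device_indices` is a hypothetical helper, not part of the crate.
use llama_cpp_2::{list_llama_ggml_backend_devices, LlamaBackendDeviceType};

fn gpu_device_indices() -> Vec<usize> {
    list_llama_ggml_backend_devices()
        .into_iter()
        // Keep discrete and integrated GPUs; skip CPU, accelerator, and unknown devices.
        .filter(|dev| {
            matches!(
                dev.device_type,
                LlamaBackendDeviceType::Gpu | LlamaBackendDeviceType::IntegratedGpu
            )
        })
        .map(|dev| dev.index)
        .collect()
}
```

As in the example binary, calling this after `LlamaBackend::init()` keeps device enumeration consistent with the backends the rest of the program will use.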
