@@ -8,7 +8,10 @@
 import enum
 import logging
 import subprocess
-from typing import List, Optional, Union, TypedDict
+from typing import Dict, List, Optional, Union, TypedDict
+from collections import defaultdict
+
+from ray._private.ray_constants import RAY_METRIC_ENABLE_GPU_NVSMI
 
 logger = logging.getLogger(__name__)
 
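Note: `RAY_METRIC_ENABLE_GPU_NVSMI` is imported from `ray._private.ray_constants` but is not defined in this diff. It presumably gates the nvidia-smi code path added below via an environment variable. A minimal sketch of what such a flag could look like (helper name and default value are assumptions, not taken from this change):

import os

def _env_bool(name: str, default: bool) -> bool:
    # Hypothetical helper: treat "1"/"true"/"yes" (case-insensitive) as enabled.
    value = os.environ.get(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes")

# Assumed default: nvidia-smi-based collection is opt-in.
RAY_METRIC_ENABLE_GPU_NVSMI = _env_bool("RAY_METRIC_ENABLE_GPU_NVSMI", False)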
@@ -33,6 +36,7 @@ class ProcessGPUInfo(TypedDict):
 
     pid: int
     gpu_memory_usage: Megabytes
+    gpu_utilization: Optional[Percentage]
 
 
 class GpuUtilizationInfo(TypedDict):
@@ -44,7 +48,7 @@ class GpuUtilizationInfo(TypedDict):
     utilization_gpu: Optional[Percentage]
     memory_used: Megabytes
     memory_total: Megabytes
-    processes_pids: Optional[List[ProcessGPUInfo]]
+    processes_pids: Optional[Dict[int, ProcessGPUInfo]]
 
 
 # tpu utilization for google tpu
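Note: `processes_pids` changes from a list of `ProcessGPUInfo` to a dict keyed by pid, so per-process entries can be looked up directly and a process reported by more than one query is stored only once. A rough sketch of how a consumer could read the new shape (illustrative only; `summarize_processes` is not part of this change):

def summarize_processes(info: GpuUtilizationInfo) -> str:
    # Iterate the dict-shaped field; it may be None or empty.
    parts = []
    for pid, proc in (info["processes_pids"] or {}).items():
        util = proc["gpu_utilization"]
        util_str = f"{util}%" if util is not None else "n/a"
        parts.append(f"pid={pid} mem={proc['gpu_memory_usage']}MB sm={util_str}")
    return ", ".join(parts) or "no GPU processes"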
@@ -105,6 +109,7 @@ class NvidiaGpuProvider(GpuProvider):
     def __init__(self):
         super().__init__()
         self._pynvml = None
+        self._using_nvidia_smi = RAY_METRIC_ENABLE_GPU_NVSMI
 
     def get_provider_name(self) -> GpuProviderType:
         return GpuProviderType.NVIDIA
@@ -149,6 +154,131 @@ def _shutdown(self):
 
     def get_gpu_utilization(self) -> List[GpuUtilizationInfo]:
         """Get GPU utilization information for all NVIDIA GPUs and MIG devices."""
+
+        return (
+            self._get_nvsmi_gpu_usage()
+            if self._using_nvidia_smi
+            else self._get_pynvml_gpu_usage()
+        )
+
+    def _get_nvsmi_gpu_usage(self) -> List[GpuUtilizationInfo]:
+        try:
+            gpu_info = subprocess.run(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=index,name,uuid,utilization.gpu,memory.used,memory.total",
+                    "--format=csv,noheader,nounits",
+                ],
+                check=True,
+                capture_output=True,
+                text=True,
+            )
+            """Sample output:
+            0, GPU-0, GPU-36e1567d-37ed-051e-f8ff-df807517b396, 0, 73348, 81559
+            1, GPU-1, GPU-4a2c89ef-1b3d-492c-a8d5-e9c614f82d73, 0, 73444, 81559
+            2, GPU-2, GPU-7f15d234-9c6a-4e8b-b3f2-c982a5d91b48, 0, 73444, 81559
+            3, GPU-3, GPU-2b8d6f91-5e4c-47a3-96d7-8b31c4f9ae52, 0, 73332, 81559
+            4, GPU-4, GPU-9d3a7c82-6b5f-4d1e-ae94-3f5c8d2e9b14, 0, 73344, 81559
+            5, GPU-5, GPU-c4e6b853-2a9d-48f6-b1c7-d4f982e6a795, 0, 73440, 81559
+            6, GPU-6, GPU-1f9b4c75-8e3a-4d2b-95c8-6a7d3b8f4e21, 0, 73440, 81559
+            7, GPU-7, GPU-5d2e9f36-4c7b-483a-b9e1-2f8ac4d5b963, 0, 73328, 81559
+            """
+            gpus = []
+            for line in sorted(gpu_info.stdout.strip().split("\n")):  # Sort by index
+                index, name, uuid, util, mem_used, mem_total = line.split(", ")
+                gpus.append(
+                    GpuUtilizationInfo(
+                        index=int(index),
+                        name=name,
+                        uuid=uuid,
+                        utilization_gpu=int(util),
+                        memory_used=int(mem_used),
+                        memory_total=int(mem_total),
+                        processes_pids={},
+                    )
+                )
+
+            processes_info = subprocess.run(
+                ["nvidia-smi", "pmon", "-c", "1"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                check=True,
+                text=True,
+            )
+            processes_info = self._parse_nvsmi_pmon_output(processes_info.stdout, gpus)
+            for gpu in gpus:
+                gpu_id = gpu["index"]
+                if gpu_id in processes_info:
+                    gpu["processes_pids"] = processes_info[gpu_id]
+            return gpus
+        except (subprocess.CalledProcessError, ValueError) as e:
+            logger.warning(f"nvidia-smi call failed: {e}. Falling back to pynvml.")
+            self._using_nvidia_smi = False
+            return self._get_pynvml_gpu_usage()
+
+    @staticmethod
+    def _parse_nvsmi_pmon_output(
+        nvsmi_stdout: str,
+        gpus: List[GpuUtilizationInfo],
+    ) -> Dict[int, List[ProcessGPUInfo]]:
+        """Parse the output of `nvidia-smi pmon -c 1`.
+
+        Sample output of 'nvidia-smi pmon -c 1':
+        # gpu         pid   type     sm    mem    enc    dec    jpg    ofa    command
+        # Idx           #    C/G      %      %      %      %      %      %    name
+            0       7175     C     84     26      -      -      -      -    ray::TorchGPUWo
+            1       7175     C     86     26      -      -      -      -    ray::TorchGPUWo
+            2          -     -      -      -      -      -      -      -    -
+
+        Returns a dict mapping GPU index to the list of ProcessGPUInfo on that GPU.
+        """
+        process_utilizations = defaultdict(list)
+        lines = nvsmi_stdout.splitlines()
+        # Find the first line that starts with "#": it is the table header.
+        table_header = None
+        for line in lines:
+            if line.startswith("#"):
+                table_header = line
+                break
+        if not table_header:
+            raise ValueError(
+                "nvidia-smi pmon output is not supported. Please upgrade to a newer version of nvidia-smi."
+            )
+        table_header = table_header.lower().split()[1:]
+        # The header columns differ between nvidia-smi versions.
+        # index() raises ValueError if a required column is missing.
+        gpu_id_index = table_header.index("gpu")
+        pid_index = table_header.index("pid")
+        sm_index = table_header.index("sm")
+        mem_index = table_header.index("mem")
+
+        for line in lines:
+            if line.startswith("#") or not line.strip():
+                continue
+
+            columns = line.split()
+            if len(columns) < max(gpu_id_index, pid_index, sm_index, mem_index) + 1:
+                continue
+
+            gpu_id, pid, sm, mem = (
+                int(columns[gpu_id_index]),
+                0 if columns[pid_index] == "-" else int(columns[pid_index]),
+                0 if columns[sm_index] == "-" else int(columns[sm_index]),
+                0 if columns[mem_index] == "-" else int(columns[mem_index]),
+            )
+            if pid == 0:  # no process on this GPU
+                continue
+            process_info = ProcessGPUInfo(
+                pid=pid,
+                gpu_memory_usage=int(
+                    gpus[gpu_id]["memory_total"] * mem / 100
+                ),  # Convert percentage to MB
+                gpu_utilization=sm,
+            )
+            process_utilizations[gpu_id].append(process_info)
+        return process_utilizations
+
+    def _get_pynvml_gpu_usage(self) -> List[GpuUtilizationInfo]:
         if not self._initialized:
             if not self._initialize():
                 return []
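Since `_parse_nvsmi_pmon_output` is a staticmethod, it can be exercised directly against the sample output embedded in its docstring. A rough usage sketch (the GPU entries and expected values below are invented for illustration; only "memory_total" is read by the parser, so minimal dicts stand in for full GpuUtilizationInfo entries):

sample_pmon = (
    "# gpu         pid   type     sm    mem    enc    dec    jpg    ofa    command\n"
    "# Idx           #    C/G      %      %      %      %      %      %    name\n"
    "    0       7175     C     84     26      -      -      -      -    ray::TorchGPUWo\n"
    "    1       7175     C     86     26      -      -      -      -    ray::TorchGPUWo\n"
    "    2          -     -      -      -      -      -      -      -    -\n"
)
fake_gpus = [{"index": i, "memory_total": 81559} for i in range(3)]

by_gpu = NvidiaGpuProvider._parse_nvsmi_pmon_output(sample_pmon, fake_gpus)
assert set(by_gpu) == {0, 1}  # GPU 2 reports no process ("-" pid)
assert by_gpu[0][0]["pid"] == 7175
assert by_gpu[0][0]["gpu_utilization"] == 84
assert by_gpu[0][0]["gpu_memory_usage"] == int(81559 * 26 / 100)  # 21205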
@@ -232,7 +362,7 @@ def _get_mig_device_info(
             logger.debug(f"Failed to retrieve MIG device utilization: {e}")
 
         # Get running processes on MIG device
-        processes_pids = []
+        processes_pids = {}
         try:
             nv_comp_processes = self._pynvml.nvmlDeviceGetComputeRunningProcesses(
                 mig_handle
@@ -241,17 +371,16 @@ def _get_mig_device_info(
                 self._pynvml.nvmlDeviceGetGraphicsRunningProcesses(mig_handle)
             )
 
-            processes_pids = [
-                ProcessGPUInfo(
+            for nv_process in nv_comp_processes + nv_graphics_processes:
+                processes_pids[int(nv_process.pid)] = ProcessGPUInfo(
                     pid=int(nv_process.pid),
                     gpu_memory_usage=(
                         int(nv_process.usedGpuMemory) // MB
                         if nv_process.usedGpuMemory
                         else 0
                     ),
+                    gpu_utilization=None,  # Not available in pynvml
                 )
-                for nv_process in (nv_comp_processes + nv_graphics_processes)
-            ]
         except self._pynvml.NVMLError as e:
             logger.debug(f"Failed to retrieve MIG device processes: {e}")
 
@@ -303,7 +432,7 @@ def _get_gpu_info(self, gpu_handle, gpu_index: int) -> Optional[GpuUtilizationInfo]:
             logger.debug(f"Failed to retrieve GPU utilization: {e}")
 
         # Get running processes
-        processes_pids = []
+        processes_pids = {}
         try:
             nv_comp_processes = self._pynvml.nvmlDeviceGetComputeRunningProcesses(
                 gpu_handle
@@ -312,17 +441,16 @@ def _get_gpu_info(self, gpu_handle, gpu_index: int) -> Optional[GpuUtilizationInfo]:
                 self._pynvml.nvmlDeviceGetGraphicsRunningProcesses(gpu_handle)
             )
 
-            processes_pids = [
-                ProcessGPUInfo(
+            for nv_process in nv_comp_processes + nv_graphics_processes:
+                processes_pids[int(nv_process.pid)] = ProcessGPUInfo(
                     pid=int(nv_process.pid),
                     gpu_memory_usage=(
                         int(nv_process.usedGpuMemory) // MB
                         if nv_process.usedGpuMemory
                         else 0
                    ),
+                    gpu_utilization=None,  # Not available in pynvml
                 )
-                for nv_process in (nv_comp_processes + nv_graphics_processes)
-            ]
         except self._pynvml.NVMLError as e:
             logger.debug(f"Failed to retrieve GPU processes: {e}")
 
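One consequence of keying by pid rather than appending to a list: a process that appears in both nvmlDeviceGetComputeRunningProcesses and nvmlDeviceGetGraphicsRunningProcesses now yields a single entry instead of a duplicate. A tiny standalone illustration of that de-duplication (fake process objects, not NVML types; MB is assumed to be 1024 * 1024 here, mirroring the module-level constant used above):

class FakeNvProcess:
    def __init__(self, pid: int, usedGpuMemory: int):
        self.pid = pid
        self.usedGpuMemory = usedGpuMemory

MB = 1024 * 1024
nv_comp_processes = [FakeNvProcess(4242, 512 * MB)]
nv_graphics_processes = [FakeNvProcess(4242, 512 * MB)]  # same pid listed twice

processes_pids = {}
for nv_process in nv_comp_processes + nv_graphics_processes:
    processes_pids[int(nv_process.pid)] = {
        "pid": int(nv_process.pid),
        "gpu_memory_usage": int(nv_process.usedGpuMemory) // MB if nv_process.usedGpuMemory else 0,
        "gpu_utilization": None,
    }

assert list(processes_pids) == [4242]  # one entry, not two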
@@ -407,16 +535,15 @@ def get_gpu_utilization(self) -> List[GpuUtilizationInfo]:
             utilization = -1
 
             # Get running processes
-            processes_pids = []
+            processes_pids = {}
             for process in self._pyamdsmi.smi_get_compute_process_info_by_device(
                 i, processes
             ):
                 if process.vram_usage:
-                    processes_pids.append(
-                        ProcessGPUInfo(
-                            pid=int(process.process_id),
-                            gpu_memory_usage=int(process.vram_usage) // MB,
-                        )
+                    processes_pids[int(process.process_id)] = ProcessGPUInfo(
+                        pid=int(process.process_id),
+                        gpu_memory_usage=int(process.vram_usage) // MB,
+                        gpu_utilization=None,
                     )
 
             info = GpuUtilizationInfo(