
Commit 69c9203

Bye-legumes, zhaoch23, and jjyao authored and committed
[Dashboard] Add GPU component usage (#52102)
Signed-off-by: zhilong <[email protected]>
Signed-off-by: zhaoch23 <[email protected]>
Signed-off-by: zhilong <[email protected]>
Co-authored-by: zhaoch23 <[email protected]>
Co-authored-by: Jiajun Yao <[email protected]>
Signed-off-by: Kamil Kaczmarek <[email protected]>
1 parent ec12a3a commit 69c9203

File tree: 9 files changed (+479, -41 lines)

ci/lint/pydoclint-baseline.txt

Lines changed: 0 additions & 1 deletion
@@ -955,7 +955,6 @@ python/ray/dashboard/modules/reporter/profile_manager.py
 DOC111: Method `MemoryProfilingManager.detach_profiler`: The option `--arg-type-hints-in-docstring` is `False` but there are type hints in the docstring arg list
 --------------------
 python/ray/dashboard/modules/reporter/reporter_agent.py
-DOC103: Method `ReporterAgent.generate_worker_stats_record`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [worker_stats: List[dict]]. Arguments in the docstring but not in the function signature: [stats: ].
 DOC201: Method `ReporterAgent.generate_worker_stats_record` does not have a return section in docstring
 --------------------
 python/ray/dashboard/modules/reporter/reporter_head.py

python/ray/_private/ray_constants.py

Lines changed: 5 additions & 0 deletions
@@ -572,6 +572,11 @@ def gcs_actor_scheduling_enabled():
 # WorkerId will be removed from all metrics.
 RAY_METRIC_CARDINALITY_LEVEL = os.environ.get("RAY_metric_cardinality_level", "legacy")
 
+# Whether GPU metrics collection via `nvidia-smi` is enabled.
+# Controlled by the environment variable `RAY_metric_enable_gpu_nvsmi`.
+# Defaults to False to use pynvml to collect usage.
+RAY_METRIC_ENABLE_GPU_NVSMI = env_bool("RAY_metric_enable_gpu_nvsmi", False)
+
 # Whether enable OpenTelemetry as the metrics collection backend on the driver
 # component. This flag is only used during the migration of the metric collection
 # backend from OpenCensus to OpenTelemetry. It will be removed in the future.
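Because the flag is evaluated with `env_bool` when `ray_constants` is imported, the environment variable has to be present in the environment of the Ray processes that collect metrics (in practice the per-node reporter agent, so it is typically exported before `ray start`). A minimal sketch of how the flag is read; only the variable name comes from the diff above, the rest is illustrative:

```python
import os

# Opt in to the nvidia-smi collection path; "1" or "true" enables it,
# anything else (or leaving it unset) keeps the default pynvml path.
os.environ["RAY_metric_enable_gpu_nvsmi"] = "1"

from ray._private.ray_constants import RAY_METRIC_ENABLE_GPU_NVSMI  # noqa: E402

print(RAY_METRIC_ENABLE_GPU_NVSMI)  # True
```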

python/ray/dashboard/client/src/pages/metrics/Metrics.tsx

Lines changed: 8 additions & 0 deletions
@@ -201,6 +201,14 @@ const METRICS_CONFIG: MetricsSectionConfig[] = [
         title: "Node Memory by Component",
         pathParams: "theme=light&panelId=34",
       },
+      {
+        title: "Node GPU by Component",
+        pathParams: "orgId=1&theme=light&panelId=45",
+      },
+      {
+        title: "Node GPU Memory by Component",
+        pathParams: "orgId=1&theme=light&panelId=46",
+      },
     ],
   },
 ];

python/ray/dashboard/consts.py

Lines changed: 1 addition & 0 deletions
@@ -74,6 +74,7 @@
 TPU_TAG_KEYS = NODE_TAG_KEYS + ["TpuDeviceName", "TpuIndex", "TpuType", "TpuTopology"]
 CLUSTER_TAG_KEYS = ["node_type", "Version", "SessionName"]
 COMPONENT_METRICS_TAG_KEYS = ["ip", "pid", "Version", "Component", "SessionName"]
+COMPONENT_GPU_TAG_KEYS = GPU_TAG_KEYS + COMPONENT_METRICS_TAG_KEYS
 
 # Dashboard metrics are tracked separately at the dashboard. TODO(sang): Support GCS.
 # Note that for dashboard subprocess module, the component name is "dashboard_[module_name]".

python/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py

Lines changed: 28 additions & 0 deletions
@@ -508,6 +508,34 @@ def max_plus_pending(max_resource, pending_resource):
         fill=0,
         stack=False,
     ),
+    Panel(
+        id=45,
+        title="Node GPU by Component",
+        description="The physical (hardware) GPU usage across the cluster, broken down by component. This reports the summed GPU usage per Ray component.",
+        unit="GPUs",
+        targets=[
+            Target(
+                expr="sum(ray_component_gpu_percentage{{{global_filters}}} / 100) by (Component)",
+                legend="{{Component}}",
+            ),
+        ],
+    ),
+    Panel(
+        id=46,
+        title="Node GPU Memory by Component",
+        description="The physical (hardware) GPU memory usage across the cluster, broken down by component. This reports the summed GPU memory usage per Ray component.",
+        unit="bytes",
+        targets=[
+            Target(
+                expr="sum(ray_component_gpu_memory_mb{{{global_filters}}}) by (Component)",
+                legend="{{Component}}",
+            ),
+            Target(
+                expr='(sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}}))*1024*1024',
+                legend="MAX",
+            ),
+        ],
+    ),
 ]
 
 
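Panel 45 divides each per-component GPU utilization percentage by 100 before summing, so the panel reads in GPU-equivalents rather than percent; the MAX series in panel 46 converts the node's GRAM total from MiB to bytes with the `*1024*1024` factor. A small worked example of the panel-45 arithmetic (the sample numbers are illustrative, not from the commit):

```python
# Illustrative only: the arithmetic behind
# sum(ray_component_gpu_percentage{...} / 100) by (Component).
samples = [
    # (component, gpu index, utilization percent reported for that component)
    ("ray::TorchGPUWorker", 0, 84),
    ("ray::TorchGPUWorker", 1, 86),
    ("raylet", 0, 2),
]

gpu_equivalents = {}
for component, _gpu, pct in samples:
    gpu_equivalents[component] = gpu_equivalents.get(component, 0.0) + pct / 100

print({c: round(v, 2) for c, v in gpu_equivalents.items()})
# {'ray::TorchGPUWorker': 1.7, 'raylet': 0.02}
```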
python/ray/dashboard/modules/reporter/gpu_providers.py

Lines changed: 145 additions & 18 deletions
@@ -8,7 +8,10 @@
 import enum
 import logging
 import subprocess
-from typing import List, Optional, Union, TypedDict
+from typing import Dict, List, Optional, Union, TypedDict
+from collections import defaultdict
+
+from ray._private.ray_constants import RAY_METRIC_ENABLE_GPU_NVSMI
 
 logger = logging.getLogger(__name__)
 
@@ -33,6 +36,7 @@ class ProcessGPUInfo(TypedDict):
 
     pid: int
     gpu_memory_usage: Megabytes
+    gpu_utilization: Optional[Percentage]
 
 
 class GpuUtilizationInfo(TypedDict):
@@ -44,7 +48,7 @@ class GpuUtilizationInfo(TypedDict):
     utilization_gpu: Optional[Percentage]
     memory_used: Megabytes
     memory_total: Megabytes
-    processes_pids: Optional[List[ProcessGPUInfo]]
+    processes_pids: Optional[Dict[int, ProcessGPUInfo]]
 
 
 # tpu utilization for google tpu
@@ -105,6 +109,7 @@ class NvidiaGpuProvider(GpuProvider):
     def __init__(self):
         super().__init__()
         self._pynvml = None
+        self._using_nvidia_smi = RAY_METRIC_ENABLE_GPU_NVSMI
 
     def get_provider_name(self) -> GpuProviderType:
         return GpuProviderType.NVIDIA
@@ -149,6 +154,131 @@ def _shutdown(self):
 
     def get_gpu_utilization(self) -> List[GpuUtilizationInfo]:
         """Get GPU utilization information for all NVIDIA GPUs and MIG devices."""
+
+        return (
+            self._get_nvsmi_gpu_usage()
+            if self._using_nvidia_smi
+            else self._get_pynvml_gpu_usage()
+        )
+
+    def _get_nvsmi_gpu_usage(self) -> List[GpuUtilizationInfo]:
+        try:
+            gpu_info = subprocess.run(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=index,name,uuid,utilization.gpu,memory.used,memory.total",
+                    "--format=csv,noheader,nounits",
+                ],
+                check=True,
+                capture_output=True,
+                text=True,
+            )
+            """Sample output:
+            0, GPU-0, GPU-36e1567d-37ed-051e-f8ff-df807517b396, 0, 73348, 81559
+            1, GPU-1, GPU-4a2c89ef-1b3d-492c-a8d5-e9c614f82d73, 0, 73444, 81559
+            2, GPU-2, GPU-7f15d234-9c6a-4e8b-b3f2-c982a5d91b48, 0, 73444, 81559
+            3, GPU-3, GPU-2b8d6f91-5e4c-47a3-96d7-8b31c4f9ae52, 0, 73332, 81559
+            4, GPU-4, GPU-9d3a7c82-6b5f-4d1e-ae94-3f5c8d2e9b14, 0, 73344, 81559
+            5, GPU-5, GPU-c4e6b853-2a9d-48f6-b1c7-d4f982e6a795, 0, 73440, 81559
+            6, GPU-6, GPU-1f9b4c75-8e3a-4d2b-95c8-6a7d3b8f4e21, 0, 73440, 81559
+            7, GPU-7, GPU-5d2e9f36-4c7b-483a-b9e1-2f8ac4d5b963, 0, 73328, 81559
+            """
+            gpus = []
+            for line in sorted(gpu_info.stdout.strip().split("\n")):  # Sort by index
+                index, name, uuid, util, mem_used, mem_total = line.split(", ")
+                gpus.append(
+                    GpuUtilizationInfo(
+                        index=int(index),
+                        name=name,
+                        uuid=uuid,
+                        utilization_gpu=int(util),
+                        memory_used=int(mem_used),
+                        memory_total=int(mem_total),
+                        processes_pids={},
+                    )
+                )
+
+            processes_info = subprocess.run(
+                ["nvidia-smi", "pmon", "-c", "1"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                check=True,
+                text=True,
+            )
+            processes_info = self._parse_nvsmi_pmon_output(processes_info.stdout, gpus)
+            for gpu in gpus:
+                gpu_id = gpu["index"]
+                if gpu_id in processes_info:
+                    gpu["processes_pids"] = processes_info[gpu_id]
+            return gpus
+        except (subprocess.CalledProcessError, ValueError) as e:
+            logger.warning(f"nvidia-smi failed to call: {e}. Falling back to pynvml.")
+            self._using_nvidia_smi = False
+            return self._get_pynvml_gpu_usage()
+
+    @staticmethod
+    def _parse_nvsmi_pmon_output(
+        nvsmi_stdout: str,
+        gpus: List[GpuUtilizationInfo],
+    ) -> Dict[int, List[ProcessGPUInfo]]:
+        """Parse the output of nvidia-smi pmon -c 1.
+
+        Sample output of 'nvidia-smi pmon -c 1':
+        # gpu    pid  type  sm  mem  enc  dec  jpg  ofa  command
+        # Idx      #   C/G   %    %    %    %    %    %  name
+            0   7175     C  84   26    -    -    -    -  ray::TorchGPUWo
+            1   7175     C  86   26    -    -    -    -  ray::TorchGPUWo
+            2      -     -   -    -    -    -    -    -  -
+
+        Returns a dict mapping GPU index to list of ProcessGPUInfo.
+        """
+        process_utilizations = defaultdict(list)
+        lines = nvsmi_stdout.splitlines()
+        # Get the first line that is started with #
+        table_header = None
+        for line in lines:
+            if line.startswith("#"):
+                table_header = line
+                break
+        if not table_header:
+            raise ValueError(
+                "nvidia-smi pmon output is not supported. Please upgrade to a newer version of nvidia-smi."
+            )
+        table_header = table_header.lower().split()[1:]
+        # Base on different versions, the header may be different.
+        # ValueError will be raised if the header is not found by the index function.
+        gpu_id_index = table_header.index("gpu")
+        pid_index = table_header.index("pid")
+        sm_index = table_header.index("sm")
+        mem_index = table_header.index("mem")
+
+        for line in lines:
+            if line.startswith("#") or not line.strip():
+                continue
+
+            columns = line.split()
+            if len(columns) < max(gpu_id_index, pid_index, sm_index, mem_index) + 1:
+                continue
+
+            gpu_id, pid, sm, mem = (
+                int(columns[gpu_id_index]),
+                0 if columns[pid_index] == "-" else int(columns[pid_index]),
+                0 if columns[sm_index] == "-" else int(columns[sm_index]),
+                0 if columns[mem_index] == "-" else int(columns[mem_index]),
+            )
+            if pid == 0:  # no process on this GPU
+                continue
+            process_info = ProcessGPUInfo(
+                pid=pid,
+                gpu_memory_usage=int(
+                    gpus[gpu_id]["memory_total"] * mem / 100
+                ),  # Convert percentage to MB
+                gpu_utilization=sm,
+            )
+            process_utilizations[gpu_id].append(process_info)
+        return process_utilizations
+
+    def _get_pynvml_gpu_usage(self) -> List[GpuUtilizationInfo]:
         if not self._initialized:
             if not self._initialize():
                 return []
@@ -232,7 +362,7 @@ def _get_mig_device_info(
             logger.debug(f"Failed to retrieve MIG device utilization: {e}")
 
         # Get running processes on MIG device
-        processes_pids = []
+        processes_pids = {}
         try:
             nv_comp_processes = self._pynvml.nvmlDeviceGetComputeRunningProcesses(
                 mig_handle
@@ -241,17 +371,16 @@
                 self._pynvml.nvmlDeviceGetGraphicsRunningProcesses(mig_handle)
             )
 
-            processes_pids = [
-                ProcessGPUInfo(
+            for nv_process in nv_comp_processes + nv_graphics_processes:
+                processes_pids[int(nv_process.pid)] = ProcessGPUInfo(
                     pid=int(nv_process.pid),
                     gpu_memory_usage=(
                         int(nv_process.usedGpuMemory) // MB
                         if nv_process.usedGpuMemory
                         else 0
                     ),
+                    gpu_utilization=None,  # Not available in pynvml
                 )
-                for nv_process in (nv_comp_processes + nv_graphics_processes)
-            ]
         except self._pynvml.NVMLError as e:
             logger.debug(f"Failed to retrieve MIG device processes: {e}")
 
@@ -303,7 +432,7 @@ def _get_gpu_info(self, gpu_handle, gpu_index: int) -> Optional[GpuUtilizationIn
             logger.debug(f"Failed to retrieve GPU utilization: {e}")
 
         # Get running processes
-        processes_pids = []
+        processes_pids = {}
         try:
             nv_comp_processes = self._pynvml.nvmlDeviceGetComputeRunningProcesses(
                 gpu_handle
@@ -312,17 +441,16 @@ def _get_gpu_info(self, gpu_handle, gpu_index: int) -> Optional[GpuUtilizationIn
                 self._pynvml.nvmlDeviceGetGraphicsRunningProcesses(gpu_handle)
             )
 
-            processes_pids = [
-                ProcessGPUInfo(
+            for nv_process in nv_comp_processes + nv_graphics_processes:
+                processes_pids[int(nv_process.pid)] = ProcessGPUInfo(
                     pid=int(nv_process.pid),
                     gpu_memory_usage=(
                         int(nv_process.usedGpuMemory) // MB
                         if nv_process.usedGpuMemory
                         else 0
                     ),
+                    gpu_utilization=None,  # Not available in pynvml
                 )
-                for nv_process in (nv_comp_processes + nv_graphics_processes)
-            ]
         except self._pynvml.NVMLError as e:
             logger.debug(f"Failed to retrieve GPU processes: {e}")
 
@@ -407,16 +535,15 @@ def get_gpu_utilization(self) -> List[GpuUtilizationInfo]:
                 utilization = -1
 
             # Get running processes
-            processes_pids = []
+            processes_pids = {}
            for process in self._pyamdsmi.smi_get_compute_process_info_by_device(
                 i, processes
             ):
                 if process.vram_usage:
-                    processes_pids.append(
-                        ProcessGPUInfo(
-                            pid=int(process.process_id),
-                            gpu_memory_usage=int(process.vram_usage) // MB,
-                        )
+                    processes_pids[int(process.process_id)] = ProcessGPUInfo(
+                        pid=int(process.process_id),
+                        gpu_memory_usage=int(process.vram_usage) // MB,
+                        gpu_utilization=None,
                     )
 
             info = GpuUtilizationInfo(
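For readers who want to see the new `pmon` parser in action, here is a hedged sketch that is not part of the commit: it feeds the docstring's sample output through `NvidiaGpuProvider._parse_nvsmi_pmon_output`. It assumes the module path `ray.dashboard.modules.reporter.gpu_providers` implied by the file above and a Ray build that includes this change; the GPU list is faked using the 81559 MiB total from the sample query output.

```python
from ray.dashboard.modules.reporter.gpu_providers import (
    GpuUtilizationInfo,
    NvidiaGpuProvider,
)

# Sample output copied from the method docstring in the diff above.
SAMPLE_PMON = """\
# gpu    pid  type  sm  mem  enc  dec  jpg  ofa  command
# Idx      #   C/G   %    %    %    %    %    %  name
    0   7175     C  84   26    -    -    -    -  ray::TorchGPUWo
    1   7175     C  86   26    -    -    -    -  ray::TorchGPUWo
    2      -     -   -    -    -    -    -    -  -
"""

# Fake per-GPU records; only "memory_total" (MiB) is used by the parser.
gpus = [
    GpuUtilizationInfo(
        index=i,
        name=f"GPU-{i}",
        uuid=f"uuid-{i}",
        utilization_gpu=0,
        memory_used=0,
        memory_total=81559,
        processes_pids={},
    )
    for i in range(3)
]

per_gpu = NvidiaGpuProvider._parse_nvsmi_pmon_output(SAMPLE_PMON, gpus)
print(per_gpu[0][0]["gpu_utilization"])   # 84 (SM % for pid 7175 on GPU 0)
print(per_gpu[0][0]["gpu_memory_usage"])  # 21205 (26% of 81559 MiB)
print(2 in per_gpu)                       # False: GPU 2 has no process
```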
