Skip to content

Commit 3fe06a9

Browse files
authored
Fix gpu metrics (#56006)
Cherry-pick of #56009. Fixes two bugs introduced in #52102: - `proc` is a TypedDict, so its pid needs to be fetched via `proc["pid"]` instead of `proc.pid`. - Changing `processes_pids` is a backwards-incompatible change that ends up changing the dashboard APIs that power the Ray Dashboard. Maintain backwards-compatibility. Verified fix: Metrics work again: <img width="947" height="441" alt="Screenshot 2025-08-27 at 12 22 40 PM" src="https://github.com/user-attachments/assets/0a9a83e7-b720-4ad0-b90e-1baa394edde5" /> Ray Dashboard works again: <img width="1824" height="1029" alt="Screenshot 2025-08-27 at 12 21 51 PM" src="https://github.com/user-attachments/assets/6b0e08e4-69c9-4223-b736-ff69b8d306db" /> --------- Signed-off-by: Alan Guo <[email protected]>
1 parent 64649b2 commit 3fe06a9

File tree

4 files changed

+24
-6
lines changed

4 files changed

+24
-6
lines changed

python/ray/dashboard/modules/node/datacenter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ async def _get_actor_info(actor: Optional[dict]) -> Optional[dict]:
225225
break
226226

227227
for gpu_stats in node_physical_stats.get("gpus", []):
228-
# gpu_stats.get("processes") can be None, an empty list or a
228+
# gpu_stats.get("processesPids") can be None, an empty list or a
229229
# list of dictionaries.
230230
for process in gpu_stats.get("processesPids") or []:
231231
if process["pid"] == pid:

python/ray/dashboard/modules/reporter/gpu_providers.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -230,9 +230,9 @@ def _parse_nvsmi_pmon_output(
230230
1 7175 C 86 26 - - - - ray::TorchGPUWo
231231
2 - - - - - - - - -
232232
233-
Returns a dict mapping GPU index to list of ProcessGPUInfo.
233+
Returns a dict mapping GPU index to dict of pid to ProcessGPUInfo.
234234
"""
235-
process_utilizations = defaultdict(list)
235+
process_utilizations = defaultdict(dict)
236236
lines = nvsmi_stdout.splitlines()
237237
# Get the first line that is started with #
238238
table_header = None
@@ -275,7 +275,7 @@ def _parse_nvsmi_pmon_output(
275275
), # Convert percentage to MB
276276
gpu_utilization=sm,
277277
)
278-
process_utilizations[gpu_id].append(process_info)
278+
process_utilizations[gpu_id][pid] = process_info
279279
return process_utilizations
280280

281281
def _get_pynvml_gpu_usage(self) -> List[GpuUtilizationInfo]:

python/ray/dashboard/modules/reporter/reporter_agent.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -882,7 +882,7 @@ def _get_agent_proc(self) -> psutil.Process:
882882
def _generate_worker_key(self, proc: psutil.Process) -> Tuple[int, float]:
883883
return (proc.pid, proc.create_time())
884884

885-
def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None):
885+
def _get_worker_processes(self):
886886
raylet_proc = self._get_raylet_proc()
887887
if raylet_proc is None:
888888
return []
@@ -899,7 +899,13 @@ def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None):
899899
self._generate_worker_key(proc): proc
900900
for proc in raylet_proc.children()
901901
}
902+
return workers
902903

904+
def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None):
905+
workers = self._get_worker_processes()
906+
if not workers:
907+
return []
908+
else:
903909
# We should keep `raylet_proc.children()` in `self` because
904910
# when `cpu_percent` is first called, it returns the meaningless 0.
905911
# See more: https://github.com/ray-project/ray/issues/29848
@@ -926,7 +932,7 @@ def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None):
926932
processes = gpu.get("processes_pids")
927933
if processes:
928934
for proc in processes.values():
929-
gpu_pid_mapping[proc.pid].append(proc)
935+
gpu_pid_mapping[proc["pid"]].append(proc)
930936

931937
result = []
932938
for w in self._workers.values():
@@ -1752,6 +1758,15 @@ def _compose_stats_payload(
17521758

17531759
self._metrics_agent.clean_all_dead_worker_metrics()
17541760

1761+
# Convert processes_pids back to a list of dictionaries to maintain backwards-compatibility
1762+
for gpu in stats["gpus"]:
1763+
if isinstance(gpu.get("processes_pids"), dict):
1764+
gpu["processes_pids"] = list(gpu["processes_pids"].values())
1765+
1766+
# TODO(aguo): Add a pydantic model for this dict to maintain compatibility
1767+
# with the Ray Dashboard API and UI code.
1768+
1769+
# NOTE: This converts keys to "Google style", (e.g: "processes_pids" -> "processesPids")
17551770
return jsonify_asdict(stats)
17561771

17571772
async def run(self, server):

python/ray/dashboard/modules/reporter/tests/test_reporter.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,6 +1017,9 @@ def _get_agent_proc(self):
10171017
def _generate_worker_key(self, proc):
10181018
return (proc.pid, proc.create_time())
10191019

1020+
def _get_worker_processes(self):
1021+
return ReporterAgent._get_worker_processes(self)
1022+
10201023
obj = ReporterAgentDummy()
10211024

10221025
try:

0 commit comments

Comments
 (0)