-
Notifications
You must be signed in to change notification settings - Fork 6.8k
Fix GPU metrics #56009
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix GPU metrics #56009
Changes from 8 commits
f8fdc58
ced201e
a13e461
18903ca
b90f9fe
aea6cd0
86c3429
362a026
a773224
acbfa7c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -893,7 +893,7 @@ def _get_agent_proc(self) -> psutil.Process: | |
def _generate_worker_key(self, proc: psutil.Process) -> Tuple[int, float]: | ||
return (proc.pid, proc.create_time()) | ||
|
||
def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None): | ||
def _get_worker_processes(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this change is for test mocking |
||
raylet_proc = self._get_raylet_proc() | ||
if raylet_proc is None: | ||
return [] | ||
|
@@ -910,7 +910,13 @@ def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None): | |
self._generate_worker_key(proc): proc | ||
for proc in raylet_proc.children() | ||
} | ||
return workers | ||
|
||
def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None): | ||
workers = self._get_worker_processes() | ||
if not workers: | ||
return [] | ||
else: | ||
# We should keep `raylet_proc.children()` in `self` because | ||
# when `cpu_percent` is first called, it returns the meaningless 0. | ||
# See more: https://github.com/ray-project/ray/issues/29848 | ||
|
@@ -937,7 +943,7 @@ def _get_workers(self, gpus: Optional[List[GpuUtilizationInfo]] = None): | |
processes = gpu.get("processes_pids") | ||
if processes: | ||
for proc in processes.values(): | ||
alanwguo marked this conversation as resolved.
Show resolved
Hide resolved
|
||
gpu_pid_mapping[proc.pid].append(proc) | ||
gpu_pid_mapping[proc["pid"]].append(proc) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. qq: is there a test we can add to actually hit this code path There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will update |
||
|
||
result = [] | ||
for w in self._workers.values(): | ||
|
@@ -1763,6 +1769,15 @@ def _compose_stats_payload( | |
|
||
self._metrics_agent.clean_all_dead_worker_metrics() | ||
|
||
# Convert processes_pids back to a list of dictionaries to maintain backwards-compatibility | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [1] is using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. comment here |
||
for gpu in stats["gpus"]: | ||
if isinstance(gpu.get("processes_pids"), dict): | ||
gpu["processes_pids"] = list(gpu["processes_pids"].values()) | ||
|
||
# TODO(aguo): Add a pydantic model for this dict to maintain compatibility | ||
# with the Ray Dashboard API and UI code. | ||
|
||
# NOTE: This converts keys to "Google style", (e.g: "processes_pids" -> "processesPids") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @can-anyscale added comment here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. look at the implementation of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's confusing and unnecessary but it's too late to change. It would be massively backwards-incompatible to remove this conversion. |
||
return jsonify_asdict(stats) | ||
|
||
async def run(self, server): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[1]