Skip to content

Commit f526590

Browse files
authored
Add Graph Execution-Order Diagnostics to All-Ranks Landing Page (#145)
**PR Summary** This change integrates the execution-order report (built in #140) into the --all-ranks-html landing page, surfacing a new “Graph Execution-Order Diagnostics” section. The page now highlights whether graph execution order differs across ranks and summarizes which ranks show collective schedule mismatches or cache hit/miss divergences, reusing existing divergence checks without introducing any CLI changes. <img width="1708" height="676" alt="Screenshot 2025-08-27 at 10 04 37 PM" src="https://github.com/user-attachments/assets/a057c49c-178e-45bc-a251-f4eaba78e214" />
1 parent 5938ed9 commit f526590

File tree

7 files changed

+15304
-6
lines changed

7 files changed

+15304
-6
lines changed

src/cli.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use std::path::PathBuf;
66

77
use fxhash::{FxHashMap, FxHashSet};
88
use tlparse::{
9-
analyze_graph_runtime_deltas, generate_multi_rank_html, parse_path,
9+
analyze_graph_runtime_deltas, build_exec_order_summary, generate_multi_rank_html, parse_path,
1010
read_chromium_events_with_pid, ArtifactFlags, Diagnostics, DivergenceFlags, DivergenceGroup,
1111
ParseConfig, RankMetaData,
1212
};
@@ -549,6 +549,8 @@ fn handle_all_ranks(
549549
out_path.display()
550550
);
551551

552+
let exec_order_summary = build_exec_order_summary(&out_path, &rank_nums, &collective_schedules);
553+
552554
let diagnostics = Diagnostics {
553555
divergence: DivergenceFlags {
554556
cache: cache_seq_groups.len() > 1,
@@ -562,6 +564,7 @@ fn handle_all_ranks(
562564
cache_groups: cache_divergence_groups.clone(),
563565
collective_groups: collective_divergence_groups.clone(),
564566
tensor_meta_groups: tensor_meta_divergence_groups.clone(),
567+
exec_order: exec_order_summary,
565568
};
566569

567570
let (landing_page_path, landing_html) = generate_multi_rank_html(

src/lib.rs

Lines changed: 219 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ mod templates;
2525
mod types;
2626

2727
pub use types::{
28-
ArtifactFlags, CollectivesParityReport, Diagnostics, DivergenceFlags, DivergenceGroup,
29-
GraphAnalysis, GraphCollectivesParity, GraphRuntime, RankMetaData, RuntimeAnalysis,
30-
RuntimeRankDetail,
28+
ArtifactFlags, CollectiveSchedule, CollectivesParityReport, Diagnostics, DivergenceFlags,
29+
DivergenceGroup, ExecOrderSummary, GraphAnalysis, GraphCollectivesParity, GraphRuntime,
30+
RankMetaData, RuntimeAnalysis, RuntimeRankDetail,
3131
};
3232

3333
pub use execution_order::{
@@ -1290,6 +1290,220 @@ pub fn generate_multi_rank_html(
12901290
Ok((landing_page_path, html))
12911291
}
12921292

1293+
/// Build ExecOrderSummary from artifacts under out_path for the given ranks
1294+
pub fn build_exec_order_summary(
1295+
out_path: &PathBuf,
1296+
rank_nums: &[u32],
1297+
collective_schedules: &[CollectiveSchedule],
1298+
) -> Option<ExecOrderSummary> {
1299+
use crate::execution_order::{
1300+
analyze_execution_order, parse_graph_execution_order, ExecOrderIssue,
1301+
};
1302+
use std::collections::HashSet;
1303+
1304+
// Preload and parse compile_directory.json per rank
1305+
let cd_by_rank: FxHashMap<u32, serde_json::Map<String, serde_json::Value>> = rank_nums
1306+
.iter()
1307+
.filter_map(|&rank| {
1308+
let path = out_path
1309+
.join(format!("rank_{rank}"))
1310+
.join("compile_directory.json");
1311+
std::fs::read_to_string(path)
1312+
.ok()
1313+
.and_then(|content| serde_json::from_str::<serde_json::Value>(&content).ok())
1314+
.and_then(|val| match val {
1315+
serde_json::Value::Object(map) => Some((rank, map)),
1316+
_ => None,
1317+
})
1318+
})
1319+
.collect();
1320+
1321+
// Collect latest "graph_execution" artifact per rank
1322+
let exec_orders: FxHashMap<u32, Vec<String>> = rank_nums
1323+
.iter()
1324+
.filter_map(|&rank| {
1325+
let map = cd_by_rank.get(&rank)?;
1326+
let rank_dir = out_path.join(format!("rank_{rank}"));
1327+
1328+
// Find latest graph_execution artifact
1329+
let best = map
1330+
.values()
1331+
.filter_map(|entry| entry.get("artifacts")?.as_array())
1332+
.flat_map(|arts| arts.iter())
1333+
.filter_map(|a| {
1334+
let name = a.get("name")?.as_str()?;
1335+
if !name.contains("graph_execution") || !name.ends_with(".json") {
1336+
return None;
1337+
}
1338+
Some((
1339+
a.get("number")?.as_u64()?,
1340+
a.get("url")?.as_str()?.to_string(),
1341+
))
1342+
})
1343+
.max_by_key(|(num, _)| *num)?;
1344+
1345+
let path = rank_dir.join(best.1);
1346+
let order = std::fs::read_to_string(path)
1347+
.ok()
1348+
.and_then(|payload| parse_graph_execution_order(&payload).ok())
1349+
.map(|order| order.into_iter().map(|s| format!("[{}]", s)).collect())?;
1350+
Some((rank, order))
1351+
})
1352+
.collect();
1353+
1354+
if exec_orders.len() < 2 {
1355+
return None;
1356+
}
1357+
1358+
// Build dir -> compile_id mapping per rank
1359+
let dir_to_compile_id_per_rank: FxHashMap<u32, FxHashMap<String, String>> = rank_nums
1360+
.iter()
1361+
.filter_map(|&rank| {
1362+
let obj = cd_by_rank.get(&rank)?;
1363+
let mapping = obj
1364+
.iter()
1365+
.flat_map(|(cid, entry)| {
1366+
entry
1367+
.get("artifacts")
1368+
.and_then(|x| x.as_array())
1369+
.map(|arts| {
1370+
arts.iter()
1371+
.filter_map(|a| {
1372+
let url = a.get("url")?.as_str()?;
1373+
let prefix = url.split_once('/')?.0;
1374+
Some((prefix.to_string(), cid.to_string()))
1375+
})
1376+
.collect::<Vec<_>>()
1377+
})
1378+
.unwrap_or_default()
1379+
})
1380+
.fold(FxHashMap::default(), |mut acc, (prefix, cid)| {
1381+
acc.entry(prefix).or_insert(cid);
1382+
acc
1383+
});
1384+
Some((rank, mapping))
1385+
})
1386+
.collect();
1387+
1388+
// Build collective ops mapping
1389+
let collective_by_graph: FxHashMap<(u32, String), Vec<String>> = collective_schedules
1390+
.iter()
1391+
.filter_map(|cs| {
1392+
let m = dir_to_compile_id_per_rank.get(&cs.rank)?;
1393+
let compile_id = m
1394+
.get(&cs.graph)
1395+
.cloned()
1396+
.unwrap_or_else(|| cs.graph.clone());
1397+
Some(((cs.rank, compile_id), cs.ops.clone()))
1398+
})
1399+
.fold(FxHashMap::default(), |mut acc, (key, ops)| {
1400+
acc.entry(key).or_default().extend(ops);
1401+
acc
1402+
});
1403+
1404+
// Build cache status mapping
1405+
let cache_status: FxHashMap<(u32, String), String> = rank_nums
1406+
.iter()
1407+
.flat_map(|&rank| {
1408+
cd_by_rank
1409+
.get(&rank)
1410+
.and_then(|obj| {
1411+
dir_to_compile_id_per_rank
1412+
.get(&rank)
1413+
.map(|dir2cid| (obj, dir2cid))
1414+
})
1415+
.map(|(obj, dir2cid)| {
1416+
let status_by_dir = obj
1417+
.values()
1418+
.filter_map(|entry| entry.get("artifacts").and_then(|x| x.as_array()))
1419+
.flat_map(|arts| arts.iter())
1420+
.filter_map(|a| {
1421+
let url = a.get("url")?.as_str()?;
1422+
let prefix = url.split_once('/')?.0.to_string();
1423+
let name = a.get("name")?.as_str()?;
1424+
let status = match () {
1425+
_ if name.contains("cache_miss") => ("miss", 3),
1426+
_ if name.contains("cache_hit") => ("hit", 2),
1427+
_ if name.contains("cache_bypass") => ("bypass", 1),
1428+
_ => return None,
1429+
};
1430+
Some((prefix, status))
1431+
})
1432+
.fold(
1433+
FxHashMap::default(),
1434+
|mut acc, (prefix, (status, priority))| {
1435+
acc.entry(prefix)
1436+
.and_modify(|e: &mut (&str, u8)| {
1437+
if priority > e.1 {
1438+
*e = (status, priority);
1439+
}
1440+
})
1441+
.or_insert((status, priority));
1442+
acc
1443+
},
1444+
);
1445+
1446+
status_by_dir
1447+
.into_iter()
1448+
.filter_map(|(dir, (st, _))| {
1449+
let cid = dir2cid.get(&dir)?;
1450+
Some(((rank, cid.clone()), st.to_string()))
1451+
})
1452+
.collect::<Vec<_>>()
1453+
})
1454+
.unwrap_or_default()
1455+
})
1456+
.collect();
1457+
1458+
// Analyze and create summary
1459+
let report = analyze_execution_order(&exec_orders, &collective_by_graph, &cache_status);
1460+
1461+
let order_differs = report.by_index.iter().any(|row| {
1462+
let uniq: HashSet<_> = row.by_rank.values().map(String::as_str).collect();
1463+
uniq.len() > 1
1464+
});
1465+
1466+
let (sched_set, cache_set) = report.by_index.iter().fold(
1467+
(HashSet::new(), HashSet::new()),
1468+
|(mut sched, mut cache), row| {
1469+
if row.issues.contains(&ExecOrderIssue::ScheduleMismatch) {
1470+
sched.extend(row.by_rank.keys());
1471+
}
1472+
if row.issues.contains(&ExecOrderIssue::CacheMismatch) {
1473+
cache.extend(row.by_rank.keys());
1474+
}
1475+
(sched, cache)
1476+
},
1477+
);
1478+
1479+
let mut ranks_schedule: Vec<u32> = sched_set.into_iter().collect();
1480+
let mut ranks_cache: Vec<u32> = cache_set.into_iter().collect();
1481+
ranks_schedule.sort_unstable();
1482+
ranks_cache.sort_unstable();
1483+
1484+
let format_ranks = |ranks: &[u32]| {
1485+
if ranks.is_empty() {
1486+
String::new()
1487+
} else {
1488+
ranks
1489+
.iter()
1490+
.map(|r| format!("Rank {r}"))
1491+
.collect::<Vec<_>>()
1492+
.join(", ")
1493+
}
1494+
};
1495+
1496+
Some(ExecOrderSummary {
1497+
order_differs,
1498+
has_schedule_mismatch: !ranks_schedule.is_empty(),
1499+
has_cache_mismatch: !ranks_cache.is_empty(),
1500+
ranks_schedule_str: format_ranks(&ranks_schedule),
1501+
ranks_cache_str: format_ranks(&ranks_cache),
1502+
ranks_schedule,
1503+
ranks_cache,
1504+
})
1505+
}
1506+
12931507
fn prepare_and_validate_graphs(
12941508
runtime_estimations: &[GraphRuntime],
12951509
) -> Option<(
@@ -1863,7 +2077,7 @@ pub mod execution_order {
18632077
// Evaluate issues among present ranks
18642078
let mut issues: Vec<ExecOrderIssue> = Vec::new();
18652079

1866-
// Schedule mismatch: compare collective op sequences
2080+
// Schedule mismatch: compare collective schedules regardless of compile_id
18672081
{
18682082
let schedules: Vec<Vec<String>> = by_rank
18692083
.iter()
@@ -1880,7 +2094,7 @@ pub mod execution_order {
18802094
}
18812095
}
18822096

1883-
// Cache skew: compare cache status markers
2097+
// Cache mismatch: compare cache statuses regardless of compile_id
18842098
{
18852099
let statuses: Vec<String> = by_rank
18862100
.iter()

src/templates.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,27 @@ desync issues on specific ranks.
621621
{{ endfor }}
622622
{{ endif }}
623623
{{ endif }}
624+
<h3>Graph Execution-Order Diagnostics</h3>
625+
<p>Note: To enable this feature, wrap your code with torch._inductor.debug.record_and_log_graph_execution_order()</p>
626+
{{ if diagnostics.exec_order }}
627+
{{ if diagnostics.exec_order.order_differs }}
628+
<p>Graph execution order differs across ranks.</p>
629+
{{ else }}
630+
<p>Graph execution order: consistent across ranks.</p>
631+
{{ endif }}
632+
{{ if diagnostics.exec_order.has_schedule_mismatch }}
633+
<p><strong>Warning:</strong> Schedule mismatch across ranks: {diagnostics.exec_order.ranks_schedule_str}</p>
634+
{{ else }}
635+
<p>Collectives Schedule: Consistent across ranks.</p>
636+
{{ endif }}
637+
{{ if diagnostics.exec_order.has_cache_mismatch }}
638+
<p><strong>Warning:</strong> Cache hit/miss mismatch across ranks: {diagnostics.exec_order.ranks_cache_str}</p>
639+
{{ else }}
640+
<p>Cache hit/miss sequence: Consistent across ranks.</p>
641+
{{ endif }}
642+
{{ else }}
643+
<p>Execution-order analysis unavailable.</p>
644+
{{ endif }}
624645
<h3>Tensor Metadata Analysis</h3>
625646
<p>
626647
Compares inductor tensor metadata (shapes, dtypes, strides) across ranks to detect compilation divergence.

src/types.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -929,6 +929,24 @@ pub struct ArtifactFlags {
929929
pub runtime_trace: bool,
930930
}
931931

932+
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
933+
pub struct ExecOrderSummary {
934+
/// True if any index has differing compile_ids across ranks
935+
pub order_differs: bool,
936+
/// Ranks involved in any schedule mismatches (sorted numerically)
937+
pub ranks_schedule: Vec<u32>,
938+
/// Ranks involved in any cache hit/miss mismatches (sorted numerically)
939+
pub ranks_cache: Vec<u32>,
940+
/// True if there is any schedule mismatch
941+
pub has_schedule_mismatch: bool,
942+
/// True if there is any cache hit/miss mismatch
943+
pub has_cache_mismatch: bool,
944+
/// Pretty-printed ranks for template rendering (e.g., "r0, r1, r2")
945+
pub ranks_schedule_str: String,
946+
/// Pretty-printed ranks for template rendering (e.g., "r0, r2")
947+
pub ranks_cache_str: String,
948+
}
949+
932950
#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
933951
pub struct Diagnostics {
934952
pub divergence: DivergenceFlags,
@@ -937,6 +955,7 @@ pub struct Diagnostics {
937955
pub cache_groups: Vec<DivergenceGroup>,
938956
pub collective_groups: Vec<DivergenceGroup>,
939957
pub tensor_meta_groups: Vec<DivergenceGroup>,
958+
pub exec_order: Option<ExecOrderSummary>,
940959
}
941960

942961
#[derive(Serialize)]

0 commit comments

Comments
 (0)