@@ -25,9 +25,9 @@ mod templates;
25
25
mod types;
26
26
27
27
pub use types:: {
28
- ArtifactFlags , CollectivesParityReport , Diagnostics , DivergenceFlags , DivergenceGroup ,
29
- GraphAnalysis , GraphCollectivesParity , GraphRuntime , RankMetaData , RuntimeAnalysis ,
30
- RuntimeRankDetail ,
28
+ ArtifactFlags , CollectiveSchedule , CollectivesParityReport , Diagnostics , DivergenceFlags ,
29
+ DivergenceGroup , ExecOrderSummary , GraphAnalysis , GraphCollectivesParity , GraphRuntime ,
30
+ RankMetaData , RuntimeAnalysis , RuntimeRankDetail ,
31
31
} ;
32
32
33
33
pub use execution_order:: {
@@ -1290,6 +1290,220 @@ pub fn generate_multi_rank_html(
1290
1290
Ok ( ( landing_page_path, html) )
1291
1291
}
1292
1292
1293
+ /// Build ExecOrderSummary from artifacts under out_path for the given ranks
1294
+ pub fn build_exec_order_summary (
1295
+ out_path : & PathBuf ,
1296
+ rank_nums : & [ u32 ] ,
1297
+ collective_schedules : & [ CollectiveSchedule ] ,
1298
+ ) -> Option < ExecOrderSummary > {
1299
+ use crate :: execution_order:: {
1300
+ analyze_execution_order, parse_graph_execution_order, ExecOrderIssue ,
1301
+ } ;
1302
+ use std:: collections:: HashSet ;
1303
+
1304
+ // Preload and parse compile_directory.json per rank
1305
+ let cd_by_rank: FxHashMap < u32 , serde_json:: Map < String , serde_json:: Value > > = rank_nums
1306
+ . iter ( )
1307
+ . filter_map ( |& rank| {
1308
+ let path = out_path
1309
+ . join ( format ! ( "rank_{rank}" ) )
1310
+ . join ( "compile_directory.json" ) ;
1311
+ std:: fs:: read_to_string ( path)
1312
+ . ok ( )
1313
+ . and_then ( |content| serde_json:: from_str :: < serde_json:: Value > ( & content) . ok ( ) )
1314
+ . and_then ( |val| match val {
1315
+ serde_json:: Value :: Object ( map) => Some ( ( rank, map) ) ,
1316
+ _ => None ,
1317
+ } )
1318
+ } )
1319
+ . collect ( ) ;
1320
+
1321
+ // Collect latest "graph_execution" artifact per rank
1322
+ let exec_orders: FxHashMap < u32 , Vec < String > > = rank_nums
1323
+ . iter ( )
1324
+ . filter_map ( |& rank| {
1325
+ let map = cd_by_rank. get ( & rank) ?;
1326
+ let rank_dir = out_path. join ( format ! ( "rank_{rank}" ) ) ;
1327
+
1328
+ // Find latest graph_execution artifact
1329
+ let best = map
1330
+ . values ( )
1331
+ . filter_map ( |entry| entry. get ( "artifacts" ) ?. as_array ( ) )
1332
+ . flat_map ( |arts| arts. iter ( ) )
1333
+ . filter_map ( |a| {
1334
+ let name = a. get ( "name" ) ?. as_str ( ) ?;
1335
+ if !name. contains ( "graph_execution" ) || !name. ends_with ( ".json" ) {
1336
+ return None ;
1337
+ }
1338
+ Some ( (
1339
+ a. get ( "number" ) ?. as_u64 ( ) ?,
1340
+ a. get ( "url" ) ?. as_str ( ) ?. to_string ( ) ,
1341
+ ) )
1342
+ } )
1343
+ . max_by_key ( |( num, _) | * num) ?;
1344
+
1345
+ let path = rank_dir. join ( best. 1 ) ;
1346
+ let order = std:: fs:: read_to_string ( path)
1347
+ . ok ( )
1348
+ . and_then ( |payload| parse_graph_execution_order ( & payload) . ok ( ) )
1349
+ . map ( |order| order. into_iter ( ) . map ( |s| format ! ( "[{}]" , s) ) . collect ( ) ) ?;
1350
+ Some ( ( rank, order) )
1351
+ } )
1352
+ . collect ( ) ;
1353
+
1354
+ if exec_orders. len ( ) < 2 {
1355
+ return None ;
1356
+ }
1357
+
1358
+ // Build dir -> compile_id mapping per rank
1359
+ let dir_to_compile_id_per_rank: FxHashMap < u32 , FxHashMap < String , String > > = rank_nums
1360
+ . iter ( )
1361
+ . filter_map ( |& rank| {
1362
+ let obj = cd_by_rank. get ( & rank) ?;
1363
+ let mapping = obj
1364
+ . iter ( )
1365
+ . flat_map ( |( cid, entry) | {
1366
+ entry
1367
+ . get ( "artifacts" )
1368
+ . and_then ( |x| x. as_array ( ) )
1369
+ . map ( |arts| {
1370
+ arts. iter ( )
1371
+ . filter_map ( |a| {
1372
+ let url = a. get ( "url" ) ?. as_str ( ) ?;
1373
+ let prefix = url. split_once ( '/' ) ?. 0 ;
1374
+ Some ( ( prefix. to_string ( ) , cid. to_string ( ) ) )
1375
+ } )
1376
+ . collect :: < Vec < _ > > ( )
1377
+ } )
1378
+ . unwrap_or_default ( )
1379
+ } )
1380
+ . fold ( FxHashMap :: default ( ) , |mut acc, ( prefix, cid) | {
1381
+ acc. entry ( prefix) . or_insert ( cid) ;
1382
+ acc
1383
+ } ) ;
1384
+ Some ( ( rank, mapping) )
1385
+ } )
1386
+ . collect ( ) ;
1387
+
1388
+ // Build collective ops mapping
1389
+ let collective_by_graph: FxHashMap < ( u32 , String ) , Vec < String > > = collective_schedules
1390
+ . iter ( )
1391
+ . filter_map ( |cs| {
1392
+ let m = dir_to_compile_id_per_rank. get ( & cs. rank ) ?;
1393
+ let compile_id = m
1394
+ . get ( & cs. graph )
1395
+ . cloned ( )
1396
+ . unwrap_or_else ( || cs. graph . clone ( ) ) ;
1397
+ Some ( ( ( cs. rank , compile_id) , cs. ops . clone ( ) ) )
1398
+ } )
1399
+ . fold ( FxHashMap :: default ( ) , |mut acc, ( key, ops) | {
1400
+ acc. entry ( key) . or_default ( ) . extend ( ops) ;
1401
+ acc
1402
+ } ) ;
1403
+
1404
+ // Build cache status mapping
1405
+ let cache_status: FxHashMap < ( u32 , String ) , String > = rank_nums
1406
+ . iter ( )
1407
+ . flat_map ( |& rank| {
1408
+ cd_by_rank
1409
+ . get ( & rank)
1410
+ . and_then ( |obj| {
1411
+ dir_to_compile_id_per_rank
1412
+ . get ( & rank)
1413
+ . map ( |dir2cid| ( obj, dir2cid) )
1414
+ } )
1415
+ . map ( |( obj, dir2cid) | {
1416
+ let status_by_dir = obj
1417
+ . values ( )
1418
+ . filter_map ( |entry| entry. get ( "artifacts" ) . and_then ( |x| x. as_array ( ) ) )
1419
+ . flat_map ( |arts| arts. iter ( ) )
1420
+ . filter_map ( |a| {
1421
+ let url = a. get ( "url" ) ?. as_str ( ) ?;
1422
+ let prefix = url. split_once ( '/' ) ?. 0 . to_string ( ) ;
1423
+ let name = a. get ( "name" ) ?. as_str ( ) ?;
1424
+ let status = match ( ) {
1425
+ _ if name. contains ( "cache_miss" ) => ( "miss" , 3 ) ,
1426
+ _ if name. contains ( "cache_hit" ) => ( "hit" , 2 ) ,
1427
+ _ if name. contains ( "cache_bypass" ) => ( "bypass" , 1 ) ,
1428
+ _ => return None ,
1429
+ } ;
1430
+ Some ( ( prefix, status) )
1431
+ } )
1432
+ . fold (
1433
+ FxHashMap :: default ( ) ,
1434
+ |mut acc, ( prefix, ( status, priority) ) | {
1435
+ acc. entry ( prefix)
1436
+ . and_modify ( |e : & mut ( & str , u8 ) | {
1437
+ if priority > e. 1 {
1438
+ * e = ( status, priority) ;
1439
+ }
1440
+ } )
1441
+ . or_insert ( ( status, priority) ) ;
1442
+ acc
1443
+ } ,
1444
+ ) ;
1445
+
1446
+ status_by_dir
1447
+ . into_iter ( )
1448
+ . filter_map ( |( dir, ( st, _) ) | {
1449
+ let cid = dir2cid. get ( & dir) ?;
1450
+ Some ( ( ( rank, cid. clone ( ) ) , st. to_string ( ) ) )
1451
+ } )
1452
+ . collect :: < Vec < _ > > ( )
1453
+ } )
1454
+ . unwrap_or_default ( )
1455
+ } )
1456
+ . collect ( ) ;
1457
+
1458
+ // Analyze and create summary
1459
+ let report = analyze_execution_order ( & exec_orders, & collective_by_graph, & cache_status) ;
1460
+
1461
+ let order_differs = report. by_index . iter ( ) . any ( |row| {
1462
+ let uniq: HashSet < _ > = row. by_rank . values ( ) . map ( String :: as_str) . collect ( ) ;
1463
+ uniq. len ( ) > 1
1464
+ } ) ;
1465
+
1466
+ let ( sched_set, cache_set) = report. by_index . iter ( ) . fold (
1467
+ ( HashSet :: new ( ) , HashSet :: new ( ) ) ,
1468
+ |( mut sched, mut cache) , row| {
1469
+ if row. issues . contains ( & ExecOrderIssue :: ScheduleMismatch ) {
1470
+ sched. extend ( row. by_rank . keys ( ) ) ;
1471
+ }
1472
+ if row. issues . contains ( & ExecOrderIssue :: CacheMismatch ) {
1473
+ cache. extend ( row. by_rank . keys ( ) ) ;
1474
+ }
1475
+ ( sched, cache)
1476
+ } ,
1477
+ ) ;
1478
+
1479
+ let mut ranks_schedule: Vec < u32 > = sched_set. into_iter ( ) . collect ( ) ;
1480
+ let mut ranks_cache: Vec < u32 > = cache_set. into_iter ( ) . collect ( ) ;
1481
+ ranks_schedule. sort_unstable ( ) ;
1482
+ ranks_cache. sort_unstable ( ) ;
1483
+
1484
+ let format_ranks = |ranks : & [ u32 ] | {
1485
+ if ranks. is_empty ( ) {
1486
+ String :: new ( )
1487
+ } else {
1488
+ ranks
1489
+ . iter ( )
1490
+ . map ( |r| format ! ( "Rank {r}" ) )
1491
+ . collect :: < Vec < _ > > ( )
1492
+ . join ( ", " )
1493
+ }
1494
+ } ;
1495
+
1496
+ Some ( ExecOrderSummary {
1497
+ order_differs,
1498
+ has_schedule_mismatch : !ranks_schedule. is_empty ( ) ,
1499
+ has_cache_mismatch : !ranks_cache. is_empty ( ) ,
1500
+ ranks_schedule_str : format_ranks ( & ranks_schedule) ,
1501
+ ranks_cache_str : format_ranks ( & ranks_cache) ,
1502
+ ranks_schedule,
1503
+ ranks_cache,
1504
+ } )
1505
+ }
1506
+
1293
1507
fn prepare_and_validate_graphs (
1294
1508
runtime_estimations : & [ GraphRuntime ] ,
1295
1509
) -> Option < (
@@ -1863,7 +2077,7 @@ pub mod execution_order {
1863
2077
// Evaluate issues among present ranks
1864
2078
let mut issues: Vec < ExecOrderIssue > = Vec :: new ( ) ;
1865
2079
1866
- // Schedule mismatch: compare collective op sequences
2080
+ // Schedule mismatch: compare collective schedules regardless of compile_id
1867
2081
{
1868
2082
let schedules: Vec < Vec < String > > = by_rank
1869
2083
. iter ( )
@@ -1880,7 +2094,7 @@ pub mod execution_order {
1880
2094
}
1881
2095
}
1882
2096
1883
- // Cache skew : compare cache status markers
2097
+ // Cache mismatch : compare cache statuses regardless of compile_id
1884
2098
{
1885
2099
let statuses: Vec < String > = by_rank
1886
2100
. iter ( )
0 commit comments