Add Collective Operations Parity Check Between Scheduler and Inductor Output Code (#136)

skarjala · web-flow · commit fc687f8f6803 · 2025-08-26T23:49:39.000-07:00
Collective Operations Parity Analysis

- Compares collective operations between two sources:
  - Scheduler artifacts: operations planned in inductor_collective_schedule_*.json files
  - Generated code: actual collective calls present in inductor_output_code_*.py files
- Counts occurrences of 6 collective operation types: all_reduce, reduce_scatter, all_gather, broadcast, reduce, all_to_all
- Calculates the absolute difference (offset) between planned and actual operations
- Generates a collectives_parity.json report for each rank

<img width="1093" height="754" alt="Screenshot 2025-08-22 at 9 49 51 AM" src="https://github.com/user-attachments/assets/13fe8bbe-5110-4060-b29f-f90d4f52b26a" />

- Example: <img width="906" height="504" alt="image" src="https://github.com/user-attachments/assets/7f299a81-0da0-45cc-b80e-84f78cfcb7c4" />
diff --git a/src/cli.rs b/src/cli.rs
@@ -459,6 +459,8 @@ fn handle_all_ranks(
         println!("Collective schedules: {}", schedules_path.display());
     }
 
+    tlparse::parsers::check_collectives_parity(&out_path, &rank_nums)?;
+
     // Process tensor meta fingerprints from all ranks
     let tensor_meta = tlparse::parsers::read_tensor_meta_fingerprints(&out_path, &rank_nums)?;
     let mut tensor_meta_groups: FxHashMap<String, Vec<u32>> = FxHashMap::default();
diff --git a/src/lib.rs b/src/lib.rs
@@ -25,8 +25,9 @@ mod templates;
 mod types;
 
 pub use types::{
-    ArtifactFlags, Diagnostics, DivergenceFlags, DivergenceGroup, GraphAnalysis, GraphRuntime,
-    RankMetaData, RuntimeAnalysis, RuntimeRankDetail,
+    ArtifactFlags, CollectivesParityReport, Diagnostics, DivergenceFlags, DivergenceGroup,
+    GraphAnalysis, GraphCollectivesParity, GraphRuntime, RankMetaData, RuntimeAnalysis,
+    RuntimeRankDetail,
 };
 
 pub use execution_order::{
diff --git a/src/parsers.rs b/src/parsers.rs
@@ -745,6 +745,157 @@ pub fn read_collective_schedules(
     )
 }
 
+/// Checks, rank by rank, that the collectives planned by the Inductor scheduler match the
+/// collective calls present in the generated output code.
+///
+/// For every `rank_<N>` directory under `out_path` (ranks without a directory are skipped),
+/// each compile directory is scanned for one `inductor_collective_schedule*.json` file and
+/// one `inductor_output_code*` file. Collectives are counted per canonical operation name in
+/// both, and the per-operation absolute differences are summed into an `offset`. Graphs with
+/// a non-zero offset are listed in `rank_<N>/collectives_parity.json`; the report is written
+/// even when no graph differs.
+///
+/// Directory entries are processed in sorted order so that the chosen files and the order of
+/// the reported graphs do not depend on the platform's directory iteration order.
+///
+/// # Errors
+///
+/// Returns an error if a rank or compile directory cannot be listed, if a selected schedule
+/// or output-code file cannot be read, or if the report cannot be serialized or written.
+/// A missing or malformed `compile_directory.json` is not an error: affected graphs are
+/// reported with the compile ID `"unknown"`. A schedule file that is not a JSON array of
+/// strings is treated as an empty schedule.
+pub fn check_collectives_parity(out_path: &PathBuf, rank_nums: &[u32]) -> anyhow::Result<()> {
+    use anyhow::Context;
+    use regex::Regex;
+    use std::{collections::HashMap, ffi::OsStr, fs};
+
+    /// Canonical collective names in match priority order. `reduce` must stay last because
+    /// it is a substring of both `all_reduce` and `reduce_scatter`.
+    const COLLECTIVE_OPS: [&str; 6] = [
+        "all_reduce",
+        "reduce_scatter",
+        "all_gather",
+        "broadcast",
+        "all_to_all",
+        "reduce",
+    ];
+
+    /// Maps a raw op name to the first canonical name it contains, if any. Matching is by
+    /// substring, so suffixed variants (including a trailing `_`) map to the same name.
+    fn normalize_op(op: &str) -> Option<&'static str> {
+        COLLECTIVE_OPS
+            .iter()
+            .copied()
+            .find(|&name| op.contains(name))
+    }
+
+    /// Counts how often each canonical collective occurs in `ops`; unrecognized ops are
+    /// ignored.
+    fn count_ops<'a>(ops: impl Iterator<Item = &'a str>) -> HashMap<&'static str, usize> {
+        let mut counts = HashMap::new();
+        for name in ops.filter_map(normalize_op) {
+            *counts.entry(name).or_insert(0) += 1;
+        }
+        counts
+    }
+
+    // Match c10d functional calls: torch.ops._c10d_functional.<op>.default(
+    let call_re = Regex::new(
+        r"torch\s*\.\s*ops\s*\.\s*_?c10d_functional\s*\.\s*([A-Za-z0-9_]+)\s*\.\s*default\s*\(",
+    )?;
+    // Removes `#` comments, `//` to end of line, and `/* ... */` blocks before counting.
+    let comment_re = Regex::new(r"(?m)^\s*#.*$|\s#[^0-9a-fA-F].*$|//.*$|(?s)/\*.*?\*/")?;
+    // Removes anything between `<` and `>`, including across line breaks.
+    let html_tag_re = Regex::new(r"(?s)<[^>]*>")?;
+
+    for &rank in rank_nums {
+        let rank_dir = out_path.join(format!("rank_{rank}"));
+        if !rank_dir.is_dir() {
+            continue;
+        }
+
+        // Map compile directory (graph folder) name prefix -> compile ID. Best effort: an
+        // unreadable or malformed compile_directory.json leaves the map empty.
+        let compile_directory = fs::read_to_string(rank_dir.join("compile_directory.json"))
+            .ok()
+            .and_then(|s| serde_json::from_str::<serde_json::Value>(&s).ok());
+        let mut dir_to_compile_id: HashMap<String, String> = HashMap::new();
+        for (cid, entry) in compile_directory
+            .as_ref()
+            .and_then(|v| v.as_object())
+            .into_iter()
+            .flatten()
+        {
+            let urls = entry
+                .get("artifacts")
+                .and_then(|x| x.as_array())
+                .into_iter()
+                .flatten()
+                .filter_map(|a| a.get("url").and_then(|x| x.as_str()));
+            for url in urls {
+                if let Some((prefix, _)) = url.split_once('/') {
+                    // The first compile ID seen for a directory prefix wins.
+                    dir_to_compile_id
+                        .entry(prefix.to_string())
+                        .or_insert_with(|| cid.clone());
+                }
+            }
+        }
+
+        let mut report = crate::types::CollectivesParityReport {
+            description: "Difference of # of collectives in scheduler and inductor output code"
+                .to_string(),
+            graphs: Vec::new(),
+        };
+
+        // `read_dir` order is platform-dependent; sort for a reproducible report.
+        let mut compile_dirs: Vec<PathBuf> = fs::read_dir(&rank_dir)
+            .with_context(|| format!("failed to list {}", rank_dir.display()))?
+            .flatten()
+            .map(|e| e.path())
+            .filter(|p| p.is_dir())
+            .collect();
+        compile_dirs.sort();
+
+        for compile_dir in compile_dirs {
+            let mut files: Vec<PathBuf> = fs::read_dir(&compile_dir)
+                .with_context(|| format!("failed to list {}", compile_dir.display()))?
+                .flatten()
+                .map(|e| e.path())
+                .collect();
+            files.sort();
+
+            // Pick the first schedule and the first output-code file in sorted order.
+            let (mut schedule_path, mut code_path) = (None, None);
+            for p in files {
+                let stem = p.file_stem().and_then(|s| s.to_str()).unwrap_or("");
+                if schedule_path.is_none()
+                    && p.extension() == Some(OsStr::new("json"))
+                    && stem.starts_with("inductor_collective_schedule")
+                {
+                    schedule_path = Some(p);
+                } else if code_path.is_none() && stem.starts_with("inductor_output_code") {
+                    code_path = Some(p);
+                }
+            }
+
+            // Only graphs that have both artifacts can be compared.
+            let (Some(schedule), Some(code)) = (schedule_path, code_path) else {
+                continue;
+            };
+
+            // Schedule counts: the file is expected to hold a JSON array of op names.
+            let schedule_text = fs::read_to_string(&schedule)
+                .with_context(|| format!("failed to read {}", schedule.display()))?;
+            let raw_ops: Vec<String> = serde_json::from_str(&schedule_text).unwrap_or_default();
+            let schedule_counts = count_ops(raw_ops.iter().map(String::as_str));
+
+            // Code counts: strip tags and comments, then count calls
+            let code_text = fs::read_to_string(&code)
+                .with_context(|| format!("failed to read {}", code.display()))?;
+            let code_clean = comment_re
+                .replace_all(&html_tag_re.replace_all(&code_text, ""), "")
+                .into_owned();
+            let code_counts = count_ops(
+                call_re
+                    .captures_iter(&code_clean)
+                    .filter_map(|cap| cap.get(1))
+                    .map(|m| m.as_str()),
+            );
+
+            // Every counted key is one of COLLECTIVE_OPS, so summing over that list covers
+            // the union of ops seen on either side.
+            let offset: usize = COLLECTIVE_OPS
+                .iter()
+                .map(|name| {
+                    let planned = schedule_counts.get(name).copied().unwrap_or(0);
+                    let emitted = code_counts.get(name).copied().unwrap_or(0);
+                    planned.abs_diff(emitted)
+                })
+                .sum();
+
+            if offset > 0 {
+                let graph = compile_dir
+                    .file_name()
+                    .and_then(|n| n.to_str())
+                    .unwrap_or("unknown")
+                    .to_string();
+                let compile_id = dir_to_compile_id
+                    .get(&graph)
+                    .cloned()
+                    .unwrap_or_else(|| "unknown".to_string());
+                report.graphs.push(crate::types::GraphCollectivesParity {
+                    graph,
+                    compile_id,
+                    offset,
+                });
+            }
+        }
+
+        let report_path = rank_dir.join("collectives_parity.json");
+        fs::write(&report_path, serde_json::to_string_pretty(&report)?)
+            .with_context(|| format!("failed to write {}", report_path.display()))?;
+    }
+
+    Ok(())
+}
+
 /// Parses a prefixed JSON file from each multi-rank output directory.
 /// It finds the first matching file, calls `parse_fn` on its contents,
 /// and collects the `Some(T)` results into a vector.
diff --git a/src/templates.rs b/src/templates.rs
@@ -185,6 +185,9 @@ PT2 generates <a href='chromium_events.json'>Chromium Trace Events</a> in JSON o
 You can download and view them in a tool like <a href='https://ui.perfetto.dev/'>Perfetto</a>.
 {{ endif  }}
 <p>
+<a href="collectives_parity.json">Collectives Parity report</a> comparing scheduler and Inductor output code collective operations.
+</p>
+<p>
 Build products below:
 </p>
 <ul>
diff --git a/src/types.rs b/src/types.rs
@@ -53,6 +53,18 @@ pub struct TensorMetaFingerprint {
     pub fingerprint: String,
 }
 
+/// Collective-count mismatch for a single graph, as recorded by
+/// `check_collectives_parity`.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct GraphCollectivesParity {
+    /// Name of the compile directory (graph folder) inside the rank directory.
+    pub graph: String,
+    /// Compile ID looked up from `compile_directory.json`, or `"unknown"` when no entry
+    /// matches the directory name.
+    pub compile_id: String,
+    /// Sum over collective op types of the absolute difference between the count in the
+    /// scheduler artifact and the count in the Inductor output code.
+    pub offset: usize,
+}
+
+/// Per-rank report serialized to `collectives_parity.json` by `check_collectives_parity`.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct CollectivesParityReport {
+    /// Human-readable explanation of what the report measures.
+    pub description: String,
+    /// Graphs whose scheduler and output-code collective counts differ (non-zero offset).
+    pub graphs: Vec<GraphCollectivesParity>,
+}
 /// Estimated runtime entry for a single op within a graph.
 #[derive(Debug, Serialize, Deserialize)]
 pub struct OpRuntime {
diff --git a/tests/inputs/collectives_parity/dedicated_log_torch_trace_rank_0.log b/tests/inputs/collectives_parity/dedicated_log_torch_trace_rank_0.log
diff --git a/tests/integration_test.rs b/tests/integration_test.rs

Original file line number	Diff line number	Diff line change
`@@ -459,6 +459,8 @@ fn handle_all_ranks(`
`459`	`459`	`println!("Collective schedules: {}", schedules_path.display());`
`460`	`460`	`}`
`461`	`461`
	`462`	`+ tlparse::parsers::check_collectives_parity(&out_path, &rank_nums)?;`
	`463`	`+`
`462`	`464`	`// Process tensor meta fingerprints from all ranks`
`463`	`465`	`let tensor_meta = tlparse::parsers::read_tensor_meta_fingerprints(&out_path, &rank_nums)?;`
`464`	`466`	`let mut tensor_meta_groups: FxHashMap<String, Vec<u32>> = FxHashMap::default();`