Add validation to the collectives parity JSON to ensure each collective is paired with a wait op (#143)

skarjala · web-flow · commit 5938ed903ed2 · 2025-08-27T14:26:59.000-07:00
This PR adds a new field to the existing collectives_parity.json on the tlparse page. It checks that, in inductor_output_code, every collective op issued has a corresponding wait op; if not, the missing_waits field records how many waits are missing. Tests are updated accordingly. <img width="1237" height="372" alt="image" src="https://github.com/user-attachments/assets/08d3c1b5-13a0-41d5-897d-8361d204a278" />
diff --git a/src/parsers.rs b/src/parsers.rs
@@ -753,7 +753,7 @@ pub fn check_collectives_parity(out_path: &PathBuf, rank_nums: &[u32]) -> anyhow
     let call_re = Regex::new(
         r"torch\s*\.\s*ops\s*\.\s*_?c10d_functional\s*\.\s*([A-Za-z0-9_]+)\s*\.\s*default\s*\(",
     )?;
-    let comment_re = Regex::new(r"(?m)^\s*#.*$|\s#[^0-9a-fA-F].*$|//.*$|(?s)/\*.*?\*/")?;
+    let comment_re = Regex::new(r"(?m)#.*$|//.*$|(?s)/\*.*?\*/")?;
     let html_tag_re = Regex::new(r"(?s)<[^>]*>")?;
 
     for &rank in rank_nums {
@@ -787,7 +787,7 @@ pub fn check_collectives_parity(out_path: &PathBuf, rank_nums: &[u32]) -> anyhow
                 .unwrap_or_default();
 
         let mut report = crate::types::CollectivesParityReport {
-            description: "Difference of # of collectives in scheduler and inductor output code"
+            description: "Difference of # of collectives in scheduler and inductor output code and missing wait collectives"
                 .to_string(),
             graphs: Vec::new(),
         };
@@ -848,11 +848,17 @@ pub fn check_collectives_parity(out_path: &PathBuf, rank_nums: &[u32]) -> anyhow
                 .replace_all(&html_tag_re.replace_all(&fs::read_to_string(code)?, ""), "")
                 .into_owned();
             let mut code_counts: HashMap<&str, usize> = HashMap::new();
+            let mut wait_count = 0usize;
             for cap in call_re.captures_iter(&code_clean) {
-                if let Some(normalized) = normalize_op(cap.get(1).unwrap().as_str()) {
+                let op = cap.get(1).unwrap().as_str();
+                if op == "wait_tensor" {
+                    wait_count += 1;
+                } else if let Some(normalized) = normalize_op(op) {
                     *code_counts.entry(normalized).or_insert(0) += 1;
                 }
             }
+            let collective_total: usize = code_counts.values().sum();
+            let missing_waits = collective_total.saturating_sub(wait_count);
 
             // Compute offset over union of all detected ops
             let mut all_ops: std::collections::HashSet<&str> =
@@ -869,7 +875,7 @@ pub fn check_collectives_parity(out_path: &PathBuf, rank_nums: &[u32]) -> anyhow
                 })
                 .sum();
 
-            if offset > 0 {
+            if offset > 0 || missing_waits > 0 {
                 let graph = compile_dir
                     .file_name()
                     .and_then(|n| n.to_str())
@@ -883,6 +889,7 @@ pub fn check_collectives_parity(out_path: &PathBuf, rank_nums: &[u32]) -> anyhow
                     graph,
                     compile_id,
                     offset,
+                    missing_waits,
                 });
             }
         }
diff --git a/src/types.rs b/src/types.rs
@@ -58,6 +58,8 @@ pub struct GraphCollectivesParity {
     pub graph: String,
     pub compile_id: String,
     pub offset: usize,
+    #[serde(default)]
+    pub missing_waits: usize,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
diff --git a/tests/inputs/collectives_parity/dedicated_log_torch_trace_rank_0.log b/tests/inputs/collectives_parity/dedicated_log_torch_trace_rank_0.log
@@ -5217,7 +5217,7 @@ V0804 12:34:16.809000 1142857 torch/_inductor/graph.py:2390] {"inductor_output_c
 	        # Topologically Sorted Source Nodes: [h_1, h_2, all_reduce_default], Original ATen: [aten.gelu, aten.native_layer_norm, _c10d_functional.all_reduce]
 	        torch.ops._c10d_functional.all_reduce_.default(buf4, 'sum', '0')
 	        # Topologically Sorted Source Nodes: [h_3], Original ATen: [_c10d_functional.wait_tensor]
-	        torch.ops._c10d_functional.wait_tensor.default(buf4)
+	        #torch.ops._c10d_functional.wait_tensor.default(buf4)
 	        buf9 = empty_strided_cuda((1024, 1024), (1024, 1), torch.float16)
 	        # Topologically Sorted Source Nodes: [h2], Original ATen: [aten.mm]
 	        extern_kernels.mm(buf4, reinterpret_tensor(arg4_1, (1024, 1024), (1, 1024), 0), out=buf9)
@@ -5227,7 +5227,7 @@ V0804 12:34:16.809000 1142857 torch/_inductor/graph.py:2390] {"inductor_output_c
 	        stream0 = get_raw_stream(0)
 	        triton_poi_fused_all_gather_into_tensor_relu_1.run(buf10, 1048576, stream=stream0)
 	        # Topologically Sorted Source Nodes: [h2_1, all_gather_into_tensor_default], Original ATen: [aten.relu, _c10d_functional.all_gather_into_tensor]
-	        #buf11 = torch.ops._c10d_functional.all_gather_into_tensor.default(buf10, 2, '0')
+	        buf11 = torch.ops._c10d_functional.all_gather_into_tensor.default(buf10, 2, '0')
 	        assert_size_stride(buf11, (2048, 1024), (1024, 1), 'torch.ops._c10d_functional.all_gather_into_tensor.default')
 	        assert_alignment(buf11, 16, 'torch.ops._c10d_functional.all_gather_into_tensor.default')
 	        del buf4
@@ -5236,10 +5236,10 @@ V0804 12:34:16.809000 1142857 torch/_inductor/graph.py:2390] {"inductor_output_c
 	        assert_size_stride(buf12, (1024, 1024), (1024, 1), 'torch.ops._c10d_functional.reduce_scatter_tensor.default')
 	        assert_alignment(buf12, 16, 'torch.ops._c10d_functional.reduce_scatter_tensor.default')
 	        # Topologically Sorted Source Nodes: [gathered], Original ATen: [_c10d_functional.wait_tensor]
-	        torch.ops._c10d_functional.wait_tensor.default(buf11)
+	        #torch.ops._c10d_functional.wait_tensor.default(buf11)
 	        del buf10
 	        # Topologically Sorted Source Nodes: [rs], Original ATen: [_c10d_functional.wait_tensor]
-	        torch.ops._c10d_functional.wait_tensor.default(buf12)
+	        #torch.ops._c10d_functional.wait_tensor.default(buf12)
 	        del arg5_1
 	        buf17 = empty_strided_cuda((2048, 1024), (1024, 1), torch.float16)
 	        # Topologically Sorted Source Nodes: [g, rs_expanded, out], Original ATen: [aten.mul, aten.repeat, aten.add]
diff --git a/tests/integration_test.rs b/tests/integration_test.rs
@@ -2471,12 +2471,20 @@ fn test_collectives_parity_detects_mismatch() -> Result<(), Box<dyn std::error::
     );
     let rank_0_report: CollectivesParityReport =
         serde_json::from_str(&fs::read_to_string(&rank_0_report_path)?)?;
-    // Expect single mismatch entry for graph -_0_1_0 with compile_id [0/1] and offset 1
-    assert_eq!(rank_0_report.graphs.len(), 1);
-    let g = &rank_0_report.graphs[0];
-    assert_eq!(g.graph, "-_0_1_0");
+    // Expect two entries: graph -_0_1_0 has offset 0 with 3 missing waits, graph -_0_0_0 has offset 0 with 1 missing wait
+    assert_eq!(rank_0_report.graphs.len(), 2);
+    let mut by_graph = HashMap::new();
+    for g in &rank_0_report.graphs {
+        by_graph.insert(g.graph.as_str(), g);
+    }
+    let g = by_graph.get("-_0_1_0").expect("missing -_0_1_0 entry");
     assert_eq!(g.compile_id, "[0/1]");
-    assert_eq!(g.offset, 1);
+    assert_eq!(g.offset, 0);
+    assert_eq!(g.missing_waits, 3);
+    let g0 = by_graph.get("-_0_0_0").expect("missing -_0_0_0 entry");
+    assert_eq!(g0.compile_id, "[0/0]");
+    assert_eq!(g0.offset, 0);
+    assert_eq!(g0.missing_waits, 1);
 
     Ok(())
 }

Original file line number	Diff line number	Diff line change
`@@ -58,6 +58,8 @@ pub struct GraphCollectivesParity {`
`58`	`58`	`pub graph: String,`
`59`	`59`	`pub compile_id: String,`
`60`	`60`	`pub offset: usize,`
	`61`	`+ #[serde(default)]`
	`62`	`+ pub missing_waits: usize,`
`61`	`63`	`}`
`62`	`64`
`63`	`65`	`#[derive(Debug, Serialize, Deserialize)]`