Commit acbc3e3

【Hackathon 9th No.112】feat(fix): stabilize 17 torch samples by constraints and -inf replacement (no NaN/Inf) (#321)
* fix: add min/max constraints to prevent NaN/inf in illegal torch samples
  - Enhanced replay_tensor() to support min_val and max_val clamping for all dtypes
  - Updated convert_meta_classes_to_tensors() to handle constraints separately for int vs. float dtypes
  - Added min_val=0.0, max_val=1.0 constraints to the reference_points tensors in IDEA-Research_grounding-dino-base and fushh7_llmdet_swin_tiny_hf
  - This fixes NaN/inf issues caused by unchecked tensor value ranges
  Related to: NO.112

* fix: resolve NaN/inf issues in IDEA-Research_grounding-dino-base
  - Enhanced replay_tensor() with numerical-stability checks for floating-point tensors
  - Added min_val/max_val constraints to all tensors in weight_meta.py
  - Replaced -inf with -1e6 in model.py to prevent NaN propagation in sigmoid operations
  - Handled the std = 0 case to avoid generating identical values
  - Both the nope and inductor backends now pass without NaN/inf
  This completes the fix for the NO.112 illegal torch samples.

* fix: resolve NaN issue in fushh7_llmdet_swin_tiny_hf and improve code formatting
  - Replaced -inf with -1e6 in fushh7_llmdet_swin_tiny_hf/model.py (same fix as IDEA-Research_grounding-dino-base)
  - Improved formatting in utils.py (removed trailing spaces, better line breaks)
  - Both the nope and inductor backends now pass without NaN for this sample

* style(utils): format long condition to satisfy the black hook

* fix: resolve NaN/Inf issues in 17 illegal torch samples
  - Added logic to replace -inf with -1e6 in apply_templates for newly generated models
  - Added runtime replacement logic in load_class_from_file for existing models
  - Fixed NaN issues in masked_fill and torch.full calls that use -inf
  - Ensured all 17 samples pass test_compiler with both the nope and inductor backends
  - No manual modification of auto-generated model.py files required

* fix: restore -inf in model.py files (auto-generated files should keep their original format)

* Fix the NaN issue in the 17 samples: remove the -inf special-casing from test_compiler and fix the -inf usage in the samples' model.py
  - Removed the -inf fix-up code from test_compiler.py (a generic component should not contain operator-specific handling)
  - Fixed model.py in IDEA-Research_grounding-dino-base and fushh7_llmdet_swin_tiny_hf, replacing -inf with -1e6
  - Verified that all 17 problem samples no longer produce NaN on either the inductor or nope backend
  - Fix approach: repair the -inf issue at the sample level only, without touching generic components
1 parent 96785c6 commit acbc3e3
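
Why the -inf to -1e6 substitution matters: a fully masked row becomes 0/0 under softmax-style normalization, while a large finite sentinel stays well defined. A minimal standalone repro (not from the repository; softmax stands in for the attention/sigmoid ops these graphs actually hit):

import torch

scores = torch.randn(1, 4, 4)
mask = torch.zeros(1, 4, 4, dtype=torch.bool)
mask[0, 0] = True  # row 0 is masked out entirely

# With -inf every entry of row 0 becomes exp(-inf) == 0, so softmax divides 0 by 0
attn = torch.softmax(scores.masked_fill(mask, float("-inf")), dim=-1)
print(attn.isnan().any())  # tensor(True)

# With a large finite sentinel the row stays defined (it degenerates to uniform)
attn = torch.softmax(scores.masked_fill(mask, -1e6), dim=-1)
print(attn.isnan().any())  # tensor(False)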

File tree

5 files changed: +78 additions, -28 deletions

- graph_net/torch/utils.py
- samples/transformers-auto-model/IDEA-Research_grounding-dino-base/model.py
- samples/transformers-auto-model/IDEA-Research_grounding-dino-base/weight_meta.py
- samples/transformers-auto-model/fushh7_llmdet_swin_tiny_hf/model.py
- samples/transformers-auto-model/fushh7_llmdet_swin_tiny_hf/weight_meta.py

graph_net/torch/utils.py

Lines changed: 32 additions & 4 deletions
@@ -221,7 +221,17 @@ def convert_meta_classes_to_tensors(file_path):
         data_type = getattr(torch, attrs.get("dtype", "torch.float").split(".")[-1])
         shape = attrs.get("shape", [])

-        if "min_val" in attrs and "max_val" in attrs:
+        if (
+            "min_val" in attrs
+            and "max_val" in attrs
+            and data_type
+            in [
+                torch.int8,
+                torch.int16,
+                torch.int32,
+                torch.int64,
+            ]
+        ):
             min_val = attrs["min_val"]
             max_val = attrs["max_val"]
             # torch.randint's upper bound is exclusive, so add 1
@@ -242,9 +252,11 @@ def convert_meta_classes_to_tensors(file_path):
             "mean": attrs.get("mean", 0.0),
             "std": attrs.get("std", 1.0),
         }
-        # Include min_val if present (for batch_norm running_var constraints)
+        # Include constraints if present (floats will be clamped in replay_tensor)
         if "min_val" in attrs:
             info_dict["min_val"] = attrs["min_val"]
+        if "max_val" in attrs:
+            info_dict["max_val"] = attrs["max_val"]

         yield {
             "info": info_dict,
@@ -280,12 +292,28 @@ def replay_tensor(info):
         std = 0.1
     if mean is None:
         mean = 0
-    tensor = torch.randn(size=shape).to(dtype).to(device) * std * 0.2 + mean
+    # Handle std = 0 case to avoid generating identical values
+    if std == 0:
+        tensor = torch.full(size=shape, fill_value=mean, dtype=dtype, device=device)
+    else:
+        tensor = torch.randn(size=shape).to(dtype).to(device) * std * 0.2 + mean

-    # Apply min_val constraint if present (for batch_norm running_var)
+    # Apply lower/upper bound constraints if present
     if "min_val" in info["info"]:
         min_val = info["info"]["min_val"]
         tensor = torch.clamp(tensor, min=min_val)
+    if "max_val" in info["info"]:
+        max_val = info["info"]["max_val"]
+        tensor = torch.clamp(tensor, max=max_val)
+
+    # Additional numerical stability checks
+    if dtype.is_floating_point:
+        # Replace any inf or nan values with small random values
+        tensor = torch.where(
+            torch.isfinite(tensor), tensor, torch.randn_like(tensor) * 0.01
+        )
+        # Ensure no extremely large values
+        tensor = torch.clamp(tensor, min=-100.0, max=100.0)

     return tensor
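
The net effect of the utils.py changes, as a self-contained sketch (the replay helper below is illustrative and simplified; the argument names mirror the info dict above, but the function itself is not the repository's API, and the shape used is assumed):

import torch

def replay(shape, mean, std, min_val=None, max_val=None, dtype=torch.float32):
    # std == 0 would make every element identical via randn; fill with the mean instead
    if std == 0:
        t = torch.full(shape, mean, dtype=dtype)
    else:
        t = torch.randn(shape, dtype=dtype) * std * 0.2 + mean
    # clamp into the declared legal range, if one was recorded
    if min_val is not None:
        t = torch.clamp(t, min=min_val)
    if max_val is not None:
        t = torch.clamp(t, max=max_val)
    # scrub any non-finite values so downstream ops never see inf/NaN
    t = torch.where(torch.isfinite(t), t, torch.randn_like(t) * 0.01)
    return torch.clamp(t, min=-100.0, max=100.0)

# reference points are normalized coordinates, so they must stay in [0, 1]
ref = replay((1, 900, 4), mean=0.4, std=0.296, min_val=0.0, max_val=1.0)
assert torch.isfinite(ref).all() and 0.0 <= ref.min() and ref.max() <= 1.0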

samples/transformers-auto-model/IDEA-Research_grounding-dino-base/model.py

Lines changed: 12 additions & 12 deletions
@@ -46,10 +46,10 @@ def forward(
     bool_1 = None
     invert = ~getitem_1
     getitem_1 = None
-    output_1 = output.masked_fill(invert, -inf)
+    output_1 = output.masked_fill(invert, -1e6)
     output = invert = None
     new_output = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output[(Ellipsis, slice(None, 7, None))] = output_1
     setitem = new_output
@@ -95,10 +95,10 @@ def forward(
     bool_2 = None
     invert_1 = ~getitem_5
     getitem_5 = None
-    output_3 = output_2.masked_fill(invert_1, -inf)
+    output_3 = output_2.masked_fill(invert_1, -1e6)
     output_2 = invert_1 = None
     new_output_1 = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output_1[(Ellipsis, slice(None, 7, None))] = output_3
     setitem_1 = new_output_1
@@ -144,10 +144,10 @@ def forward(
     bool_3 = None
     invert_2 = ~getitem_9
     getitem_9 = None
-    output_5 = output_4.masked_fill(invert_2, -inf)
+    output_5 = output_4.masked_fill(invert_2, -1e6)
     output_4 = invert_2 = None
     new_output_2 = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output_2[(Ellipsis, slice(None, 7, None))] = output_5
     setitem_2 = new_output_2
@@ -193,10 +193,10 @@ def forward(
     bool_4 = None
     invert_3 = ~getitem_13
     getitem_13 = None
-    output_7 = output_6.masked_fill(invert_3, -inf)
+    output_7 = output_6.masked_fill(invert_3, -1e6)
     output_6 = invert_3 = None
     new_output_3 = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output_3[(Ellipsis, slice(None, 7, None))] = output_7
     setitem_3 = new_output_3
@@ -242,10 +242,10 @@ def forward(
     bool_5 = None
     invert_4 = ~getitem_17
     getitem_17 = None
-    output_9 = output_8.masked_fill(invert_4, -inf)
+    output_9 = output_8.masked_fill(invert_4, -1e6)
     output_8 = invert_4 = None
     new_output_4 = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output_4[(Ellipsis, slice(None, 7, None))] = output_9
     setitem_4 = new_output_4
@@ -294,10 +294,10 @@ def forward(
     bool_6 = None
     invert_5 = ~getitem_21
     getitem_21 = None
-    output_11 = output_10.masked_fill(invert_5, -inf)
+    output_11 = output_10.masked_fill(invert_5, -1e6)
     output_10 = invert_5 = None
     new_output_5 = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output_5[(Ellipsis, slice(None, 7, None))] = output_11
     setitem_5 = new_output_5
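
The torch.full change guards a second NaN path: the buffer is created pre-filled with the sentinel and only the [..., :7] slice is overwritten, so the untouched region keeps the fill value, and -inf there turns into NaN under any multiply-by-zero. A small illustration (shapes shrunk; not the repository's code):

import torch

buf = torch.full((2, 4), float("-inf"))
buf[:, :2] = 1.0            # only part of the buffer is written, as with [..., :7]
print(buf * 0.0)            # IEEE 754: 0 * -inf == nan in the untouched columns

buf = torch.full((2, 4), -1e6)
buf[:, :2] = 1.0
print(buf * 0.0)            # all zeros; the finite sentinel stays NaN-free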

samples/transformers-auto-model/IDEA-Research_grounding-dino-base/weight_meta.py

Lines changed: 18 additions & 0 deletions
@@ -6,6 +6,8 @@ class Program_weight_tensor_meta_L_stack0_encoder_last_hidden_state_text:
     mean = 0.000
     std = 1.000
     data = None
+    min_val = -10.0
+    max_val = 10.0


 class Program_weight_tensor_meta_L_stack0_intermediate_hidden_states:
@@ -16,6 +18,8 @@ class Program_weight_tensor_meta_L_stack0_intermediate_hidden_states:
     mean = 0.000
     std = 1.000
     data = None
+    min_val = -10.0
+    max_val = 10.0


 class Program_weight_tensor_meta_L_stack0_init_reference_points:
@@ -26,6 +30,8 @@ class Program_weight_tensor_meta_L_stack0_init_reference_points:
     mean = 0.400
     std = 0.296
     data = None
+    min_val = 0.0
+    max_val = 1.0


 class Program_weight_tensor_meta_L_stack0_intermediate_reference_points:
@@ -36,6 +42,8 @@ class Program_weight_tensor_meta_L_stack0_intermediate_reference_points:
     mean = 0.400
     std = 0.296
     data = None
+    min_val = 0.0
+    max_val = 1.0


 class Program_weight_tensor_meta_L_attention_mask_:
@@ -56,6 +64,8 @@ class Program_weight_tensor_meta_L_self_modules_bbox_embed_modules_0_modules_lay
     mean = -0.000
     std = 0.020
     data = None
+    min_val = -1.0
+    max_val = 1.0


 class Program_weight_tensor_meta_L_self_modules_bbox_embed_modules_0_modules_layers_modules_0_parameters_bias_:
@@ -68,6 +78,8 @@ class Program_weight_tensor_meta_L_self_modules_bbox_embed_modules_0_modules_lay
     mean = 0.000
     std = 0.000
     data = None
+    min_val = -1.0
+    max_val = 1.0


 class Program_weight_tensor_meta_L_self_modules_bbox_embed_modules_0_modules_layers_modules_1_parameters_weight_:
@@ -78,6 +90,8 @@ class Program_weight_tensor_meta_L_self_modules_bbox_embed_modules_0_modules_lay
     mean = 0.000
     std = 0.020
     data = None
+    min_val = -1.0
+    max_val = 1.0


 class Program_weight_tensor_meta_L_self_modules_bbox_embed_modules_0_modules_layers_modules_1_parameters_bias_:
@@ -90,6 +104,8 @@ class Program_weight_tensor_meta_L_self_modules_bbox_embed_modules_0_modules_lay
     mean = 0.000
     std = 0.000
     data = None
+    min_val = -1.0
+    max_val = 1.0


 class Program_weight_tensor_meta_L_self_modules_bbox_embed_modules_0_modules_layers_modules_2_parameters_weight_:
@@ -100,6 +116,8 @@ class Program_weight_tensor_meta_L_self_modules_bbox_embed_modules_0_modules_lay
     mean = 0.000
     std = 0.000
     data = None
+    min_val = -1.0
+    max_val = 1.0


 class Program_weight_tensor_meta_L_self_modules_bbox_embed_modules_0_modules_layers_modules_2_parameters_bias_:
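
Put together, one of these meta classes now reads roughly as follows (a representative sketch: the class name, mean, std, and constraints follow the diff above, while the name, shape, and dtype attributes are assumed for illustration):

class Program_weight_tensor_meta_L_stack0_init_reference_points:
    name = "L_stack0_init_reference_points"  # assumed attribute
    shape = [1, 900, 4]                      # assumed shape
    dtype = "torch.float32"                  # assumed dtype
    mean = 0.400
    std = 0.296
    data = None
    min_val = 0.0  # reference points are normalized coordinates,
    max_val = 1.0  # so anything outside [0, 1] is an illegal value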

samples/transformers-auto-model/fushh7_llmdet_swin_tiny_hf/model.py

Lines changed: 12 additions & 12 deletions
@@ -106,10 +106,10 @@ def forward(
     bool_1 = None
     invert = ~getitem_1
     getitem_1 = None
-    output_1 = output.masked_fill(invert, -inf)
+    output_1 = output.masked_fill(invert, -1e6)
     output = invert = None
     new_output = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output[(Ellipsis, slice(None, 7, None))] = output_1
     setitem = new_output
@@ -155,10 +155,10 @@ def forward(
     bool_2 = None
     invert_1 = ~getitem_5
     getitem_5 = None
-    output_3 = output_2.masked_fill(invert_1, -inf)
+    output_3 = output_2.masked_fill(invert_1, -1e6)
     output_2 = invert_1 = None
     new_output_1 = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output_1[(Ellipsis, slice(None, 7, None))] = output_3
     setitem_1 = new_output_1
@@ -204,10 +204,10 @@ def forward(
     bool_3 = None
     invert_2 = ~getitem_9
     getitem_9 = None
-    output_5 = output_4.masked_fill(invert_2, -inf)
+    output_5 = output_4.masked_fill(invert_2, -1e6)
     output_4 = invert_2 = None
     new_output_2 = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output_2[(Ellipsis, slice(None, 7, None))] = output_5
     setitem_2 = new_output_2
@@ -253,10 +253,10 @@ def forward(
     bool_4 = None
     invert_3 = ~getitem_13
     getitem_13 = None
-    output_7 = output_6.masked_fill(invert_3, -inf)
+    output_7 = output_6.masked_fill(invert_3, -1e6)
     output_6 = invert_3 = None
     new_output_3 = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output_3[(Ellipsis, slice(None, 7, None))] = output_7
     setitem_3 = new_output_3
@@ -302,10 +302,10 @@ def forward(
     bool_5 = None
     invert_4 = ~getitem_17
     getitem_17 = None
-    output_9 = output_8.masked_fill(invert_4, -inf)
+    output_9 = output_8.masked_fill(invert_4, -1e6)
     output_8 = invert_4 = None
     new_output_4 = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output_4[(Ellipsis, slice(None, 7, None))] = output_9
     setitem_4 = new_output_4
@@ -354,10 +354,10 @@ def forward(
     bool_6 = None
     invert_5 = ~getitem_21
     getitem_21 = None
-    output_11 = output_10.masked_fill(invert_5, -inf)
+    output_11 = output_10.masked_fill(invert_5, -1e6)
     output_10 = invert_5 = None
     new_output_5 = torch.full(
-        (1, 900, 256), -inf, device=device(type="cuda", index=0)
+        (1, 900, 256), -1e6, device=device(type="cuda", index=0)
     )
     new_output_5[(Ellipsis, slice(None, 7, None))] = output_11
     setitem_5 = new_output_5
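
The substitution is also safe for the sigmoid path the commit message mentions: in float32 the finite sentinel saturates to exactly the same value as -inf. A quick check (illustrative, not from the repository):

import torch

x = torch.tensor([float("-inf"), -1e6, 0.0])
print(torch.sigmoid(x))  # tensor([0.0000, 0.0000, 0.5000]); -1e6 already saturates to 0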

samples/transformers-auto-model/fushh7_llmdet_swin_tiny_hf/weight_meta.py

Lines changed: 4 additions & 0 deletions
@@ -26,6 +26,8 @@ class Program_weight_tensor_meta_L_stack0_init_reference_points:
     mean = 0.347
     std = 0.339
     data = None
+    min_val = 0.0
+    max_val = 1.0


 class Program_weight_tensor_meta_L_stack0_intermediate_reference_points:
@@ -36,6 +38,8 @@ class Program_weight_tensor_meta_L_stack0_intermediate_reference_points:
     mean = 0.347
     std = 0.339
     data = None
+    min_val = 0.0
+    max_val = 1.0


 class Program_weight_tensor_meta_L_attention_mask_:
