diff --git a/graph_net/torch/extractor.py b/graph_net/torch/extractor.py
index 4c6f34778..1a3f28bb4 100644
--- a/graph_net/torch/extractor.py
+++ b/graph_net/torch/extractor.py
@@ -7,6 +7,9 @@ torch._dynamo.config.capture_scalar_outputs = True
 torch._dynamo.config.capture_dynamic_output_shape_ops = True
+torch._dynamo.config.capture_sparse_compute = True
+torch._dynamo.config.raise_on_ctx_manager_usage = False
+torch._dynamo.config.allow_rnn = True
 
 
 def extract(name, dynamic=True, mut_graph_codes=None, placeholder_auto_rename=False):
@@ -84,6 +87,11 @@ def wrapper(model: torch.nn.Module):
     class GraphExtractor:
         def __init__(self):
             self.subgraph_counter = 0
+            self.workspace_path = os.environ.get("GRAPH_NET_EXTRACT_WORKSPACE")
+            if not self.workspace_path:
+                raise EnvironmentError(
+                    "Environment variable 'GRAPH_NET_EXTRACT_WORKSPACE' is not set."
+                )
 
         def move_files(self, source_dir, target_dir):
             os.makedirs(target_dir, exist_ok=True)
@@ -94,13 +102,8 @@ def move_files(source_dir, target_dir):
             shutil.move(source_path, target_path)
 
         def __call__(self, gm: torch.fx.GraphModule, sample_inputs):
-            # 1. Get workspace path
-            workspace_path = os.environ.get("GRAPH_NET_EXTRACT_WORKSPACE")
-            if not workspace_path:
-                raise EnvironmentError(
-                    "Environment variable 'GRAPH_NET_EXTRACT_WORKSPACE' is not set."
-                )
-            model_path = os.path.join(workspace_path, name)
+            # 1. Get model path
+            model_path = os.path.join(self.workspace_path, name)
             os.makedirs(model_path, exist_ok=True)
 
             if self.subgraph_counter == 0:
@@ -140,6 +143,15 @@ def try_rename_placeholder(node):
                     input = torch.tensor(4)
                     params[node.target] = input
                     input_idx += 1
+
+                if node.op == "call_function" and hasattr(node.target, "__name__"):
+                    if node.target.__name__ in [
+                        "_enter_autocast",
+                        "_exit_autocast",
+                    ]:
+                        node.replace_all_uses_with(node.args[0])
+                        gm.graph.erase_node(node)
+
             assert input_idx == len(sample_inputs)
             if mut_graph_codes is not None:
                 assert isinstance(mut_graph_codes, list)
diff --git a/samples/transformers-auto-model/Qwen1.5-0.5B/graph_hash.txt b/samples/transformers-auto-model/Qwen1.5-0.5B/graph_hash.txt
new file mode 100644
index 000000000..17afafe70
--- /dev/null
+++ b/samples/transformers-auto-model/Qwen1.5-0.5B/graph_hash.txt
@@ -0,0 +1 @@
+fdf650e0bdc3326eccec3edf5adcc6683a1d516b7d8192a39e3fd1d27bfc423f
\ No newline at end of file
diff --git a/samples/transformers-auto-model/Qwen1.5-0.5B/graph_net.json b/samples/transformers-auto-model/Qwen1.5-0.5B/graph_net.json
new file mode 100644
index 000000000..23093ba1e
--- /dev/null
+++ b/samples/transformers-auto-model/Qwen1.5-0.5B/graph_net.json
@@ -0,0 +1,7 @@
+{
+    "framework": "torch",
+    "num_devices_required": 1,
+    "num_nodes_required": 1,
+    "dynamic": false,
+    "model_name": "Qwen/Qwen1.5-0.5B"
+}
\ No newline at end of file
diff --git a/samples/transformers-auto-model/Qwen1.5-0.5B/input_meta.py b/samples/transformers-auto-model/Qwen1.5-0.5B/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/Qwen1.5-0.5B/input_tensor_constraints.py b/samples/transformers-auto-model/Qwen1.5-0.5B/input_tensor_constraints.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/Qwen1.5-0.5B/model.py b/samples/transformers-auto-model/Qwen1.5-0.5B/model.py
new file mode 100644
index 000000000..db2c0729b
--- /dev/null
+++ b/samples/transformers-auto-model/Qwen1.5-0.5B/model.py
@@ -0,0 +1,4498 @@
+import torch
+
+from torch import device
+
+
+class GraphModule(torch.nn.Module):
+    def forward(
+        self,
+        L_kwargs_input_ids_: torch.Tensor,
+        L_self_modules_model_modules_embed_tokens_parameters_weight_: torch.nn.parameter.Parameter,
+        L_kwargs_attention_mask_: torch.Tensor,
+        L_self_modules_model_modules_rotary_emb_buffers_inv_freq_: torch.Tensor,
+        L_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_model_modules_norm_parameters_weight_: torch.nn.parameter.Parameter,
+    ):
+        l_kwargs_input_ids_ = L_kwargs_input_ids_
+        l_self_modules_model_modules_embed_tokens_parameters_weight_ = (
+            L_self_modules_model_modules_embed_tokens_parameters_weight_
+        )
+        l_kwargs_attention_mask_ = L_kwargs_attention_mask_
+        l_self_modules_model_modules_rotary_emb_buffers_inv_freq_ = (
+            L_self_modules_model_modules_rotary_emb_buffers_inv_freq_
+        )
+        l_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_
+        l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_
+        l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_
+        l_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = 
L_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_ = 
L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ + 
l_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ + 
l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = 
L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = 
L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_ + 
l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + 
l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_ = 
L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_ = L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_ + l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_ + l_self_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_model_modules_norm_parameters_weight_ = ( + 
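
# NOTE: the flat placeholder names above are Dynamo's encoding of module paths
# ("." becomes "_modules_" / "_parameters_" / "_buffers_"). A rough sketch of the
# inverse mapping -- decode_placeholder is illustrative only, not part of this patch:
import re

def decode_placeholder(name: str) -> str:
    # e.g. "L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_"
    #   -> "model.layers.5.self_attn.q_proj.weight"
    name = name.strip("_").removeprefix("L_self_modules_")
    return re.sub(r"_(modules|parameters|buffers)_", ".", name)
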
+        l_self_modules_model_modules_norm_parameters_weight_ = (
+            L_self_modules_model_modules_norm_parameters_weight_
+        )
+        inputs_embeds = torch.nn.functional.embedding(
+            l_kwargs_input_ids_,
+            l_self_modules_model_modules_embed_tokens_parameters_weight_,
+            None,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        l_kwargs_input_ids_ = None
+        cache_position = torch.arange(0, 19, device=device(type="cuda", index=0))
+        position_ids = cache_position.unsqueeze(0)
+        attention_mask = l_kwargs_attention_mask_.to(
+            device=device(type="cuda", index=0), dtype=torch.bool
+        )
+        l_kwargs_attention_mask_ = None
+        kv_arange = torch.arange(19, device=device(type="cuda", index=0))
+        kv_arange += 0
+        kv_arange_1 = kv_arange
+        kv_arange = None
+        batch_arange = torch.arange(1, device=device(type="cuda", index=0))
+        head_arange = torch.arange(1, device=device(type="cuda", index=0))
+        lazy_load_decompositions = (
+            torch._functorch.predispatch.lazy_load_decompositions()
+        )
+        lazy_load_decompositions = None
+        _vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(
+            1, "error"
+        )
+        _vmap_increment_nesting = None
+        child = torch._functorch.predispatch._add_batch_dim(batch_arange, 0, 1)
+        batch_arange = None
+        lazy_load_decompositions_1 = (
+            torch._functorch.predispatch.lazy_load_decompositions()
+        )
+        lazy_load_decompositions_1 = None
+        _vmap_increment_nesting_1 = (
+            torch._functorch.predispatch._vmap_increment_nesting(1, "error")
+        )
+        _vmap_increment_nesting_1 = None
+        child_1 = torch._functorch.predispatch._add_batch_dim(head_arange, 0, 2)
+        head_arange = child_1 = None
+        lazy_load_decompositions_2 = (
+            torch._functorch.predispatch.lazy_load_decompositions()
+        )
+        lazy_load_decompositions_2 = None
+        _vmap_increment_nesting_2 = (
+            torch._functorch.predispatch._vmap_increment_nesting(19, "error")
+        )
+        _vmap_increment_nesting_2 = None
+        child_2 = torch._functorch.predispatch._add_batch_dim(cache_position, 0, 3)
+        cache_position = None
+        lazy_load_decompositions_3 = (
+            torch._functorch.predispatch.lazy_load_decompositions()
+        )
+        lazy_load_decompositions_3 = None
+        _vmap_increment_nesting_3 = (
+            torch._functorch.predispatch._vmap_increment_nesting(19, "error")
+        )
+        _vmap_increment_nesting_3 = None
+        child_3 = torch._functorch.predispatch._add_batch_dim(kv_arange_1, 0, 4)
+        kv_arange_1 = None
+        result = child_2.new_ones((), dtype=torch.bool)
+        le = child_3.le(child_2)
+        child_2 = None
+        to_1 = le.to(device(type="cuda", index=0))
+        le = None
+        result_1 = result.__and__(to_1)
+        result = to_1 = None
+        index = torch.ops.aten.index(attention_mask, [child, child_3])
+        attention_mask = child = child_3 = None
+        to_2 = index.to(device(type="cuda", index=0))
+        index = None
+        result_2 = result_1.__and__(to_2)
+        result_1 = to_2 = None
+        batched_outputs = torch._functorch.predispatch._remove_batch_dim(
+            result_2, 4, 19, 0
+        )
+        result_2 = None
+        _vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting()
+        _vmap_decrement_nesting = None
+        batched_outputs_1 = torch._functorch.predispatch._remove_batch_dim(
+            batched_outputs, 3, 19, 0
+        )
+        batched_outputs = None
+        _vmap_decrement_nesting_1 = (
+            torch._functorch.predispatch._vmap_decrement_nesting()
+        )
+        _vmap_decrement_nesting_1 = None
+        batched_outputs_2 = torch._functorch.predispatch._remove_batch_dim(
+            batched_outputs_1, 2, 1, 0
+        )
+        batched_outputs_1 = None
+        _vmap_decrement_nesting_2 = (
+            torch._functorch.predispatch._vmap_decrement_nesting()
+        )
+        _vmap_decrement_nesting_2 = None
+        causal_mask = torch._functorch.predispatch._remove_batch_dim(
+            batched_outputs_2, 1, 1, 0
+        )
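
# NOTE: the _vmap_increment_nesting / _add_batch_dim / _remove_batch_dim calls
# above are the traced internals of torch.vmap evaluating the mask predicate over
# (batch, head, q_idx, kv_idx). Assuming that reading, a plain-broadcasting sketch
# of the same boolean causal-plus-padding mask (seq_len is 19 here):
def build_causal_mask(attention_mask, seq_len):
    q_idx = torch.arange(seq_len, device=attention_mask.device)
    kv_idx = torch.arange(seq_len, device=attention_mask.device)
    causal = kv_idx[None, :] <= q_idx[:, None]         # lower-triangular (q, kv)
    padding = attention_mask.bool()[:, None, None, :]  # (batch, 1, 1, kv)
    return causal[None, None] & padding                # (batch, 1, q, kv)
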
+        batched_outputs_2 = None
+        _vmap_decrement_nesting_3 = (
+            torch._functorch.predispatch._vmap_decrement_nesting()
+        )
+        _vmap_decrement_nesting_3 = None
+        getitem = l_self_modules_model_modules_rotary_emb_buffers_inv_freq_[
+            (None, slice(None, None, None), None)
+        ]
+        l_self_modules_model_modules_rotary_emb_buffers_inv_freq_ = None
+        float_1 = getitem.float()
+        getitem = None
+        expand = float_1.expand(1, -1, 1)
+        float_1 = None
+        inv_freq_expanded = expand.to(device(type="cuda", index=0))
+        expand = None
+        getitem_1 = position_ids[
+            (slice(None, None, None), None, slice(None, None, None))
+        ]
+        position_ids = None
+        position_ids_expanded = getitem_1.float()
+        getitem_1 = None
+        float_3 = inv_freq_expanded.float()
+        inv_freq_expanded = None
+        float_4 = position_ids_expanded.float()
+        position_ids_expanded = None
+        matmul = float_3 @ float_4
+        float_3 = float_4 = None
+        freqs = matmul.transpose(1, 2)
+        matmul = None
+        emb = torch.cat((freqs, freqs), dim=-1)
+        freqs = None
+        cos = emb.cos()
+        cos_1 = cos * 1.0
+        cos = None
+        sin = emb.sin()
+        emb = None
+        sin_1 = sin * 1.0
+        sin = None
+        cos_2 = cos_1.to(dtype=torch.float32)
+        cos_1 = None
+        sin_2 = sin_1.to(dtype=torch.float32)
+        sin_1 = None
+        _log_api_usage_once = torch._C._log_api_usage_once("python.nn_module")
+        _log_api_usage_once = None
+        hidden_states = inputs_embeds.to(torch.float32)
+        pow_1 = hidden_states.pow(2)
+        variance = pow_1.mean(-1, keepdim=True)
+        pow_1 = None
+        add = variance + 1e-06
+        variance = None
+        rsqrt = torch.rsqrt(add)
+        add = None
+        hidden_states_1 = hidden_states * rsqrt
+        hidden_states = rsqrt = None
+        to_7 = hidden_states_1.to(torch.float32)
+        hidden_states_1 = None
+        hidden_states_2 = (
+            l_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_
+            * to_7
+        )
+        l_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = (
+            to_7
+        ) = None
+        linear = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view = linear.view((1, 19, -1, 64))
+        linear = None
+        query_states = view.transpose(1, 2)
+        view = None
+        linear_1 = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_1 = linear_1.view((1, 19, -1, 64))
+        linear_1 = None
+        key_states = view_1.transpose(1, 2)
+        view_1 = None
+        linear_2 = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        hidden_states_2 = l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
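
# NOTE: the pow(2).mean / rsqrt / weight-mul sequence above (repeated before every
# attention and MLP block below) is Qwen2's RMSNorm traced inline, and the rotary
# cos/sin tables come from the inv_freq-by-position outer product just before it.
# The normalization as a standalone sketch (eps matches the 1e-06 in the graph):
def rms_norm(x, weight, eps=1e-06):
    variance = x.pow(2).mean(-1, keepdim=True)
    return weight * (x * torch.rsqrt(variance + eps))
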
+        view_2 = linear_2.view((1, 19, -1, 64))
+        linear_2 = None
+        value_states = view_2.transpose(1, 2)
+        view_2 = None
+        cos_3 = cos_2.unsqueeze(1)
+        sin_3 = sin_2.unsqueeze(1)
+        mul_4 = query_states * cos_3
+        x1 = query_states[(Ellipsis, slice(None, 32, None))]
+        x2 = query_states[(Ellipsis, slice(32, None, None))]
+        query_states = None
+        neg = -x2
+        x2 = None
+        cat_1 = torch.cat((neg, x1), dim=-1)
+        neg = x1 = None
+        mul_5 = cat_1 * sin_3
+        cat_1 = None
+        q_embed = mul_4 + mul_5
+        mul_4 = mul_5 = None
+        mul_6 = key_states * cos_3
+        cos_3 = None
+        x1_1 = key_states[(Ellipsis, slice(None, 32, None))]
+        x2_1 = key_states[(Ellipsis, slice(32, None, None))]
+        key_states = None
+        neg_1 = -x2_1
+        x2_1 = None
+        cat_2 = torch.cat((neg_1, x1_1), dim=-1)
+        neg_1 = x1_1 = None
+        mul_7 = cat_2 * sin_3
+        cat_2 = sin_3 = None
+        k_embed = mul_6 + mul_7
+        mul_6 = mul_7 = None
+        attention_mask_1 = causal_mask[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 19, None),
+            )
+        ]
+        attn_output = torch._C._nn.scaled_dot_product_attention(
+            q_embed,
+            k_embed,
+            value_states,
+            attn_mask=attention_mask_1,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        q_embed = k_embed = value_states = attention_mask_1 = None
+        transpose_4 = attn_output.transpose(1, 2)
+        attn_output = None
+        attn_output_1 = transpose_4.contiguous()
+        transpose_4 = None
+        reshape = attn_output_1.reshape(1, 19, -1)
+        attn_output_1 = None
+        attn_output_2 = reshape.contiguous()
+        reshape = None
+        attn_output_3 = torch._C._nn.linear(
+            attn_output_2,
+            l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_2 = l_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_3 = inputs_embeds + attn_output_3
+        inputs_embeds = attn_output_3 = None
+        hidden_states_4 = hidden_states_3.to(torch.float32)
+        pow_2 = hidden_states_4.pow(2)
+        variance_1 = pow_2.mean(-1, keepdim=True)
+        pow_2 = None
+        add_4 = variance_1 + 1e-06
+        variance_1 = None
+        rsqrt_1 = torch.rsqrt(add_4)
+        add_4 = None
+        hidden_states_5 = hidden_states_4 * rsqrt_1
+        hidden_states_4 = rsqrt_1 = None
+        to_9 = hidden_states_5.to(torch.float32)
+        hidden_states_5 = None
+        hidden_states_6 = (
+            l_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_
+            * to_9
+        )
+        l_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = (
+            to_9
+        ) = None
+        linear_4 = torch._C._nn.linear(
+            hidden_states_6,
+            l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu = torch.nn.functional.silu(linear_4, inplace=False)
+        linear_4 = None
+        linear_5 = torch._C._nn.linear(
+            hidden_states_6,
+            l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_6 = l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_10 = silu * linear_5
+        silu = linear_5 = None
+        down_proj = torch._C._nn.linear(
+            mul_10,
+            l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_10 = l_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_7 = hidden_states_3 + down_proj
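
# NOTE: the slice-at-32 / negate / concat pattern above is rotary position
# embedding applied via "rotate_half" (head_dim is 64, so the split is at 32),
# followed by SDPA, o_proj and the residual add. Equivalent helpers, as a sketch:
def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, cos, sin):
    cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin
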
+        hidden_states_3 = down_proj = None
+        hidden_states_8 = hidden_states_7.to(torch.float32)
+        pow_3 = hidden_states_8.pow(2)
+        variance_2 = pow_3.mean(-1, keepdim=True)
+        pow_3 = None
+        add_6 = variance_2 + 1e-06
+        variance_2 = None
+        rsqrt_2 = torch.rsqrt(add_6)
+        add_6 = None
+        hidden_states_9 = hidden_states_8 * rsqrt_2
+        hidden_states_8 = rsqrt_2 = None
+        to_11 = hidden_states_9.to(torch.float32)
+        hidden_states_9 = None
+        hidden_states_10 = (
+            l_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_
+            * to_11
+        )
+        l_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = (
+            to_11
+        ) = None
+        linear_7 = torch._C._nn.linear(
+            hidden_states_10,
+            l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_3 = linear_7.view((1, 19, -1, 64))
+        linear_7 = None
+        query_states_1 = view_3.transpose(1, 2)
+        view_3 = None
+        linear_8 = torch._C._nn.linear(
+            hidden_states_10,
+            l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_4 = linear_8.view((1, 19, -1, 64))
+        linear_8 = None
+        key_states_1 = view_4.transpose(1, 2)
+        view_4 = None
+        linear_9 = torch._C._nn.linear(
+            hidden_states_10,
+            l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        hidden_states_10 = l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_5 = linear_9.view((1, 19, -1, 64))
+        linear_9 = None
+        value_states_1 = view_5.transpose(1, 2)
+        view_5 = None
+        cos_4 = cos_2.unsqueeze(1)
+        sin_4 = sin_2.unsqueeze(1)
+        mul_13 = query_states_1 * cos_4
+        x1_2 = query_states_1[(Ellipsis, slice(None, 32, None))]
+        x2_2 = query_states_1[(Ellipsis, slice(32, None, None))]
+        query_states_1 = None
+        neg_2 = -x2_2
+        x2_2 = None
+        cat_3 = torch.cat((neg_2, x1_2), dim=-1)
+        neg_2 = x1_2 = None
+        mul_14 = cat_3 * sin_4
+        cat_3 = None
+        q_embed_1 = mul_13 + mul_14
+        mul_13 = mul_14 = None
+        mul_15 = key_states_1 * cos_4
+        cos_4 = None
+        x1_3 = key_states_1[(Ellipsis, slice(None, 32, None))]
+        x2_3 = key_states_1[(Ellipsis, slice(32, None, None))]
+        key_states_1 = None
+        neg_3 = -x2_3
+        x2_3 = None
+        cat_4 = torch.cat((neg_3, x1_3), dim=-1)
+        neg_3 = x1_3 = None
+        mul_16 = cat_4 * sin_4
+        cat_4 = sin_4 = None
+        k_embed_1 = mul_15 + mul_16
+        mul_15 = mul_16 = None
+        attention_mask_2 = causal_mask[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 19, None),
+            )
+        ]
+        attn_output_4 = torch._C._nn.scaled_dot_product_attention(
+            q_embed_1,
+            k_embed_1,
+            value_states_1,
+            attn_mask=attention_mask_2,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
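
# NOTE: torch._C._nn.scaled_dot_product_attention is the private entry point
# behind torch.nn.functional.scaled_dot_product_attention; scale=0.125 equals
# 1/sqrt(head_dim) for head_dim=64. Public-API equivalent of the call above,
# as a sketch:
def sdpa(q, k, v, mask):
    return torch.nn.functional.scaled_dot_product_attention(
        q, k, v, attn_mask=mask, dropout_p=0.0, scale=0.125, is_causal=False
    )
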
+        q_embed_1 = k_embed_1 = value_states_1 = attention_mask_2 = None
+        transpose_8 = attn_output_4.transpose(1, 2)
+        attn_output_4 = None
+        attn_output_5 = transpose_8.contiguous()
+        transpose_8 = None
+        reshape_1 = attn_output_5.reshape(1, 19, -1)
+        attn_output_5 = None
+        attn_output_6 = reshape_1.contiguous()
+        reshape_1 = None
+        attn_output_7 = torch._C._nn.linear(
+            attn_output_6,
+            l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_6 = l_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_11 = hidden_states_7 + attn_output_7
+        hidden_states_7 = attn_output_7 = None
+        hidden_states_12 = hidden_states_11.to(torch.float32)
+        pow_4 = hidden_states_12.pow(2)
+        variance_3 = pow_4.mean(-1, keepdim=True)
+        pow_4 = None
+        add_10 = variance_3 + 1e-06
+        variance_3 = None
+        rsqrt_3 = torch.rsqrt(add_10)
+        add_10 = None
+        hidden_states_13 = hidden_states_12 * rsqrt_3
+        hidden_states_12 = rsqrt_3 = None
+        to_13 = hidden_states_13.to(torch.float32)
+        hidden_states_13 = None
+        hidden_states_14 = (
+            l_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_
+            * to_13
+        )
+        l_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = (
+            to_13
+        ) = None
+        linear_11 = torch._C._nn.linear(
+            hidden_states_14,
+            l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_1 = torch.nn.functional.silu(linear_11, inplace=False)
+        linear_11 = None
+        linear_12 = torch._C._nn.linear(
+            hidden_states_14,
+            l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_14 = l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_19 = silu_1 * linear_12
+        silu_1 = linear_12 = None
+        down_proj_1 = torch._C._nn.linear(
+            mul_19,
+            l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_19 = l_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_15 = hidden_states_11 + down_proj_1
+        hidden_states_11 = down_proj_1 = None
+        hidden_states_16 = hidden_states_15.to(torch.float32)
+        pow_5 = hidden_states_16.pow(2)
+        variance_4 = pow_5.mean(-1, keepdim=True)
+        pow_5 = None
+        add_12 = variance_4 + 1e-06
+        variance_4 = None
+        rsqrt_4 = torch.rsqrt(add_12)
+        add_12 = None
+        hidden_states_17 = hidden_states_16 * rsqrt_4
+        hidden_states_16 = rsqrt_4 = None
+        to_15 = hidden_states_17.to(torch.float32)
+        hidden_states_17 = None
+        hidden_states_18 = (
+            l_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_
+            * to_15
+        )
+        l_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = (
+            to_15
+        ) = None
+        linear_14 = torch._C._nn.linear(
+            hidden_states_18,
+            l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
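
# NOTE: each layer ends with the gate/up/down projections and SiLU seen above,
# i.e. Qwen2's SwiGLU MLP: down_proj(silu(gate_proj(x)) * up_proj(x)). As a
# standalone sketch over the same weight tensors:
def swiglu_mlp(x, gate_w, up_w, down_w):
    import torch.nn.functional as F
    return F.linear(F.silu(F.linear(x, gate_w)) * F.linear(x, up_w), down_w)
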
l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_6 = linear_14.view((1, 19, -1, 64)) + linear_14 = None + query_states_2 = view_6.transpose(1, 2) + view_6 = None + linear_15 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_7 = linear_15.view((1, 19, -1, 64)) + linear_15 = None + key_states_2 = view_7.transpose(1, 2) + view_7 = None + linear_16 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_18 = l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_8 = linear_16.view((1, 19, -1, 64)) + linear_16 = None + value_states_2 = view_8.transpose(1, 2) + view_8 = None + cos_5 = cos_2.unsqueeze(1) + sin_5 = sin_2.unsqueeze(1) + mul_22 = query_states_2 * cos_5 + x1_4 = query_states_2[(Ellipsis, slice(None, 32, None))] + x2_4 = query_states_2[(Ellipsis, slice(32, None, None))] + query_states_2 = None + neg_4 = -x2_4 + x2_4 = None + cat_5 = torch.cat((neg_4, x1_4), dim=-1) + neg_4 = x1_4 = None + mul_23 = cat_5 * sin_5 + cat_5 = None + q_embed_2 = mul_22 + mul_23 + mul_22 = mul_23 = None + mul_24 = key_states_2 * cos_5 + cos_5 = None + x1_5 = key_states_2[(Ellipsis, slice(None, 32, None))] + x2_5 = key_states_2[(Ellipsis, slice(32, None, None))] + key_states_2 = None + neg_5 = -x2_5 + x2_5 = None + cat_6 = torch.cat((neg_5, x1_5), dim=-1) + neg_5 = x1_5 = None + mul_25 = cat_6 * sin_5 + cat_6 = sin_5 = None + k_embed_2 = mul_24 + mul_25 + mul_24 = mul_25 = None + attention_mask_3 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_8 = torch._C._nn.scaled_dot_product_attention( + q_embed_2, + k_embed_2, + value_states_2, + attn_mask=attention_mask_3, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_2 = k_embed_2 = value_states_2 = attention_mask_3 = None + transpose_12 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_12.contiguous() + transpose_12 = None + reshape_2 = attn_output_9.reshape(1, 19, -1) + attn_output_9 = None + attn_output_10 = reshape_2.contiguous() + reshape_2 = None + attn_output_11 = torch._C._nn.linear( + attn_output_10, + l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_10 = l_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_19 = hidden_states_15 + attn_output_11 + hidden_states_15 = attn_output_11 = None + hidden_states_20 = hidden_states_19.to(torch.float32) + pow_6 = hidden_states_20.pow(2) + variance_5 = pow_6.mean(-1, keepdim=True) + pow_6 = None + add_16 = variance_5 + 1e-06 + variance_5 = None + rsqrt_5 = torch.rsqrt(add_16) + add_16 = None + 
hidden_states_21 = hidden_states_20 * rsqrt_5 + hidden_states_20 = rsqrt_5 = None + to_17 = hidden_states_21.to(torch.float32) + hidden_states_21 = None + hidden_states_22 = ( + l_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + * to_17 + ) + l_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = ( + to_17 + ) = None + linear_18 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_2 = torch.nn.functional.silu(linear_18, inplace=False) + linear_18 = None + linear_19 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_22 = l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_28 = silu_2 * linear_19 + silu_2 = linear_19 = None + down_proj_2 = torch._C._nn.linear( + mul_28, + l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_28 = l_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_23 = hidden_states_19 + down_proj_2 + hidden_states_19 = down_proj_2 = None + hidden_states_24 = hidden_states_23.to(torch.float32) + pow_7 = hidden_states_24.pow(2) + variance_6 = pow_7.mean(-1, keepdim=True) + pow_7 = None + add_18 = variance_6 + 1e-06 + variance_6 = None + rsqrt_6 = torch.rsqrt(add_18) + add_18 = None + hidden_states_25 = hidden_states_24 * rsqrt_6 + hidden_states_24 = rsqrt_6 = None + to_19 = hidden_states_25.to(torch.float32) + hidden_states_25 = None + hidden_states_26 = ( + l_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + * to_19 + ) + l_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = ( + to_19 + ) = None + linear_21 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_9 = linear_21.view((1, 19, -1, 64)) + linear_21 = None + query_states_3 = view_9.transpose(1, 2) + view_9 = None + linear_22 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_10 = linear_22.view((1, 19, -1, 64)) + linear_22 = None + key_states_3 = view_10.transpose(1, 2) + view_10 = None + linear_23 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_, + ) 
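+ # Rebinding a name to None, as in the statement below, is how this generated
+ # code drops each captured parameter or intermediate right after its last use.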
+ hidden_states_26 = l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_11 = linear_23.view((1, 19, -1, 64)) + linear_23 = None + value_states_3 = view_11.transpose(1, 2) + view_11 = None + cos_6 = cos_2.unsqueeze(1) + sin_6 = sin_2.unsqueeze(1) + mul_31 = query_states_3 * cos_6 + x1_6 = query_states_3[(Ellipsis, slice(None, 32, None))] + x2_6 = query_states_3[(Ellipsis, slice(32, None, None))] + query_states_3 = None + neg_6 = -x2_6 + x2_6 = None + cat_7 = torch.cat((neg_6, x1_6), dim=-1) + neg_6 = x1_6 = None + mul_32 = cat_7 * sin_6 + cat_7 = None + q_embed_3 = mul_31 + mul_32 + mul_31 = mul_32 = None + mul_33 = key_states_3 * cos_6 + cos_6 = None + x1_7 = key_states_3[(Ellipsis, slice(None, 32, None))] + x2_7 = key_states_3[(Ellipsis, slice(32, None, None))] + key_states_3 = None + neg_7 = -x2_7 + x2_7 = None + cat_8 = torch.cat((neg_7, x1_7), dim=-1) + neg_7 = x1_7 = None + mul_34 = cat_8 * sin_6 + cat_8 = sin_6 = None + k_embed_3 = mul_33 + mul_34 + mul_33 = mul_34 = None + attention_mask_4 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_12 = torch._C._nn.scaled_dot_product_attention( + q_embed_3, + k_embed_3, + value_states_3, + attn_mask=attention_mask_4, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_3 = k_embed_3 = value_states_3 = attention_mask_4 = None + transpose_16 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_13 = transpose_16.contiguous() + transpose_16 = None + reshape_3 = attn_output_13.reshape(1, 19, -1) + attn_output_13 = None + attn_output_14 = reshape_3.contiguous() + reshape_3 = None + attn_output_15 = torch._C._nn.linear( + attn_output_14, + l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_14 = l_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_27 = hidden_states_23 + attn_output_15 + hidden_states_23 = attn_output_15 = None + hidden_states_28 = hidden_states_27.to(torch.float32) + pow_8 = hidden_states_28.pow(2) + variance_7 = pow_8.mean(-1, keepdim=True) + pow_8 = None + add_22 = variance_7 + 1e-06 + variance_7 = None + rsqrt_7 = torch.rsqrt(add_22) + add_22 = None + hidden_states_29 = hidden_states_28 * rsqrt_7 + hidden_states_28 = rsqrt_7 = None + to_21 = hidden_states_29.to(torch.float32) + hidden_states_29 = None + hidden_states_30 = ( + l_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + * to_21 + ) + l_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = ( + to_21 + ) = None + linear_25 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_3 = torch.nn.functional.silu(linear_25, inplace=False) + linear_25 = None + linear_26 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_30 = l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_37 = silu_3 
* linear_26 + silu_3 = linear_26 = None + down_proj_3 = torch._C._nn.linear( + mul_37, + l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_37 = l_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_31 = hidden_states_27 + down_proj_3 + hidden_states_27 = down_proj_3 = None + hidden_states_32 = hidden_states_31.to(torch.float32) + pow_9 = hidden_states_32.pow(2) + variance_8 = pow_9.mean(-1, keepdim=True) + pow_9 = None + add_24 = variance_8 + 1e-06 + variance_8 = None + rsqrt_8 = torch.rsqrt(add_24) + add_24 = None + hidden_states_33 = hidden_states_32 * rsqrt_8 + hidden_states_32 = rsqrt_8 = None + to_23 = hidden_states_33.to(torch.float32) + hidden_states_33 = None + hidden_states_34 = ( + l_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + * to_23 + ) + l_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = ( + to_23 + ) = None + linear_28 = torch._C._nn.linear( + hidden_states_34, + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_12 = linear_28.view((1, 19, -1, 64)) + linear_28 = None + query_states_4 = view_12.transpose(1, 2) + view_12 = None + linear_29 = torch._C._nn.linear( + hidden_states_34, + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_13 = linear_29.view((1, 19, -1, 64)) + linear_29 = None + key_states_4 = view_13.transpose(1, 2) + view_13 = None + linear_30 = torch._C._nn.linear( + hidden_states_34, + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_34 = l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_14 = linear_30.view((1, 19, -1, 64)) + linear_30 = None + value_states_4 = view_14.transpose(1, 2) + view_14 = None + cos_7 = cos_2.unsqueeze(1) + sin_7 = sin_2.unsqueeze(1) + mul_40 = query_states_4 * cos_7 + x1_8 = query_states_4[(Ellipsis, slice(None, 32, None))] + x2_8 = query_states_4[(Ellipsis, slice(32, None, None))] + query_states_4 = None + neg_8 = -x2_8 + x2_8 = None + cat_9 = torch.cat((neg_8, x1_8), dim=-1) + neg_8 = x1_8 = None + mul_41 = cat_9 * sin_7 + cat_9 = None + q_embed_4 = mul_40 + mul_41 + mul_40 = mul_41 = None + mul_42 = key_states_4 * cos_7 + cos_7 = None + x1_9 = key_states_4[(Ellipsis, slice(None, 32, None))] + x2_9 = key_states_4[(Ellipsis, slice(32, None, None))] + key_states_4 = None + neg_9 = -x2_9 + x2_9 = None + cat_10 = torch.cat((neg_9, x1_9), dim=-1) + neg_9 = x1_9 = None + mul_43 = cat_10 * 
sin_7 + cat_10 = sin_7 = None + k_embed_4 = mul_42 + mul_43 + mul_42 = mul_43 = None + attention_mask_5 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_16 = torch._C._nn.scaled_dot_product_attention( + q_embed_4, + k_embed_4, + value_states_4, + attn_mask=attention_mask_5, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_4 = k_embed_4 = value_states_4 = attention_mask_5 = None + transpose_20 = attn_output_16.transpose(1, 2) + attn_output_16 = None + attn_output_17 = transpose_20.contiguous() + transpose_20 = None + reshape_4 = attn_output_17.reshape(1, 19, -1) + attn_output_17 = None + attn_output_18 = reshape_4.contiguous() + reshape_4 = None + attn_output_19 = torch._C._nn.linear( + attn_output_18, + l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_18 = l_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_35 = hidden_states_31 + attn_output_19 + hidden_states_31 = attn_output_19 = None + hidden_states_36 = hidden_states_35.to(torch.float32) + pow_10 = hidden_states_36.pow(2) + variance_9 = pow_10.mean(-1, keepdim=True) + pow_10 = None + add_28 = variance_9 + 1e-06 + variance_9 = None + rsqrt_9 = torch.rsqrt(add_28) + add_28 = None + hidden_states_37 = hidden_states_36 * rsqrt_9 + hidden_states_36 = rsqrt_9 = None + to_25 = hidden_states_37.to(torch.float32) + hidden_states_37 = None + hidden_states_38 = ( + l_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + * to_25 + ) + l_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = ( + to_25 + ) = None + linear_32 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_4 = torch.nn.functional.silu(linear_32, inplace=False) + linear_32 = None + linear_33 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_38 = l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_46 = silu_4 * linear_33 + silu_4 = linear_33 = None + down_proj_4 = torch._C._nn.linear( + mul_46, + l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_46 = l_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_39 = hidden_states_35 + down_proj_4 + hidden_states_35 = down_proj_4 = None + hidden_states_40 = hidden_states_39.to(torch.float32) + pow_11 = hidden_states_40.pow(2) + variance_10 = pow_11.mean(-1, keepdim=True) + pow_11 = None + add_30 = variance_10 + 1e-06 + variance_10 = None + rsqrt_10 = torch.rsqrt(add_30) + add_30 = None + hidden_states_41 = hidden_states_40 * rsqrt_10 + hidden_states_40 = rsqrt_10 = None + to_27 = hidden_states_41.to(torch.float32) + hidden_states_41 = None + hidden_states_42 = ( + l_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + * to_27 + ) + l_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = ( + to_27 + ) = None + 
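+ # Decoder layer 5 begins below. Every layer in this dump repeats one block;
+ # schematically (an editorial sketch, not code emitted by the extractor):
+ #   h = x + o_proj(SDPA(rope(q_proj(norm1(x))), rope(k_proj(norm1(x))),
+ #                       v_proj(norm1(x)), attn_mask=causal_mask[..., :19]))
+ #   x = h + down_proj(silu(gate_proj(norm2(h))) * up_proj(norm2(h)))
+ # The q/k/v projections carry biases; o_proj and the MLP projections do not.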
linear_35 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_15 = linear_35.view((1, 19, -1, 64)) + linear_35 = None + query_states_5 = view_15.transpose(1, 2) + view_15 = None + linear_36 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_16 = linear_36.view((1, 19, -1, 64)) + linear_36 = None + key_states_5 = view_16.transpose(1, 2) + view_16 = None + linear_37 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_42 = l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_17 = linear_37.view((1, 19, -1, 64)) + linear_37 = None + value_states_5 = view_17.transpose(1, 2) + view_17 = None + cos_8 = cos_2.unsqueeze(1) + sin_8 = sin_2.unsqueeze(1) + mul_49 = query_states_5 * cos_8 + x1_10 = query_states_5[(Ellipsis, slice(None, 32, None))] + x2_10 = query_states_5[(Ellipsis, slice(32, None, None))] + query_states_5 = None + neg_10 = -x2_10 + x2_10 = None + cat_11 = torch.cat((neg_10, x1_10), dim=-1) + neg_10 = x1_10 = None + mul_50 = cat_11 * sin_8 + cat_11 = None + q_embed_5 = mul_49 + mul_50 + mul_49 = mul_50 = None + mul_51 = key_states_5 * cos_8 + cos_8 = None + x1_11 = key_states_5[(Ellipsis, slice(None, 32, None))] + x2_11 = key_states_5[(Ellipsis, slice(32, None, None))] + key_states_5 = None + neg_11 = -x2_11 + x2_11 = None + cat_12 = torch.cat((neg_11, x1_11), dim=-1) + neg_11 = x1_11 = None + mul_52 = cat_12 * sin_8 + cat_12 = sin_8 = None + k_embed_5 = mul_51 + mul_52 + mul_51 = mul_52 = None + attention_mask_6 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_20 = torch._C._nn.scaled_dot_product_attention( + q_embed_5, + k_embed_5, + value_states_5, + attn_mask=attention_mask_6, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_5 = k_embed_5 = value_states_5 = attention_mask_6 = None + transpose_24 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_24.contiguous() + transpose_24 = None + reshape_5 = attn_output_21.reshape(1, 19, -1) + attn_output_21 = None + attn_output_22 = reshape_5.contiguous() + reshape_5 = None + attn_output_23 = torch._C._nn.linear( + attn_output_22, + l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_22 = 
l_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_43 = hidden_states_39 + attn_output_23 + hidden_states_39 = attn_output_23 = None + hidden_states_44 = hidden_states_43.to(torch.float32) + pow_12 = hidden_states_44.pow(2) + variance_11 = pow_12.mean(-1, keepdim=True) + pow_12 = None + add_34 = variance_11 + 1e-06 + variance_11 = None + rsqrt_11 = torch.rsqrt(add_34) + add_34 = None + hidden_states_45 = hidden_states_44 * rsqrt_11 + hidden_states_44 = rsqrt_11 = None + to_29 = hidden_states_45.to(torch.float32) + hidden_states_45 = None + hidden_states_46 = ( + l_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + * to_29 + ) + l_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = ( + to_29 + ) = None + linear_39 = torch._C._nn.linear( + hidden_states_46, + l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_5 = torch.nn.functional.silu(linear_39, inplace=False) + linear_39 = None + linear_40 = torch._C._nn.linear( + hidden_states_46, + l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_46 = l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_55 = silu_5 * linear_40 + silu_5 = linear_40 = None + down_proj_5 = torch._C._nn.linear( + mul_55, + l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_55 = l_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_47 = hidden_states_43 + down_proj_5 + hidden_states_43 = down_proj_5 = None + hidden_states_48 = hidden_states_47.to(torch.float32) + pow_13 = hidden_states_48.pow(2) + variance_12 = pow_13.mean(-1, keepdim=True) + pow_13 = None + add_36 = variance_12 + 1e-06 + variance_12 = None + rsqrt_12 = torch.rsqrt(add_36) + add_36 = None + hidden_states_49 = hidden_states_48 * rsqrt_12 + hidden_states_48 = rsqrt_12 = None + to_31 = hidden_states_49.to(torch.float32) + hidden_states_49 = None + hidden_states_50 = ( + l_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + * to_31 + ) + l_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = ( + to_31 + ) = None + linear_42 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_18 = linear_42.view((1, 19, -1, 64)) + linear_42 = None + query_states_6 = view_18.transpose(1, 2) + view_18 = None + linear_43 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_ = 
l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_19 = linear_43.view((1, 19, -1, 64)) + linear_43 = None + key_states_6 = view_19.transpose(1, 2) + view_19 = None + linear_44 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_50 = l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_20 = linear_44.view((1, 19, -1, 64)) + linear_44 = None + value_states_6 = view_20.transpose(1, 2) + view_20 = None + cos_9 = cos_2.unsqueeze(1) + sin_9 = sin_2.unsqueeze(1) + mul_58 = query_states_6 * cos_9 + x1_12 = query_states_6[(Ellipsis, slice(None, 32, None))] + x2_12 = query_states_6[(Ellipsis, slice(32, None, None))] + query_states_6 = None + neg_12 = -x2_12 + x2_12 = None + cat_13 = torch.cat((neg_12, x1_12), dim=-1) + neg_12 = x1_12 = None + mul_59 = cat_13 * sin_9 + cat_13 = None + q_embed_6 = mul_58 + mul_59 + mul_58 = mul_59 = None + mul_60 = key_states_6 * cos_9 + cos_9 = None + x1_13 = key_states_6[(Ellipsis, slice(None, 32, None))] + x2_13 = key_states_6[(Ellipsis, slice(32, None, None))] + key_states_6 = None + neg_13 = -x2_13 + x2_13 = None + cat_14 = torch.cat((neg_13, x1_13), dim=-1) + neg_13 = x1_13 = None + mul_61 = cat_14 * sin_9 + cat_14 = sin_9 = None + k_embed_6 = mul_60 + mul_61 + mul_60 = mul_61 = None + attention_mask_7 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_24 = torch._C._nn.scaled_dot_product_attention( + q_embed_6, + k_embed_6, + value_states_6, + attn_mask=attention_mask_7, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_6 = k_embed_6 = value_states_6 = attention_mask_7 = None + transpose_28 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_25 = transpose_28.contiguous() + transpose_28 = None + reshape_6 = attn_output_25.reshape(1, 19, -1) + attn_output_25 = None + attn_output_26 = reshape_6.contiguous() + reshape_6 = None + attn_output_27 = torch._C._nn.linear( + attn_output_26, + l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_26 = l_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_51 = hidden_states_47 + attn_output_27 + hidden_states_47 = attn_output_27 = None + hidden_states_52 = hidden_states_51.to(torch.float32) + pow_14 = hidden_states_52.pow(2) + variance_13 = pow_14.mean(-1, keepdim=True) + pow_14 = None + add_40 = variance_13 + 1e-06 + variance_13 = None + rsqrt_13 = torch.rsqrt(add_40) + add_40 = None + hidden_states_53 = hidden_states_52 * rsqrt_13 + hidden_states_52 = rsqrt_13 = None + to_33 = hidden_states_53.to(torch.float32) + hidden_states_53 = None + hidden_states_54 = ( + l_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + * to_33 + ) + l_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = ( + to_33 + ) = None + linear_46 = torch._C._nn.linear( + hidden_states_54, + 
l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_6 = torch.nn.functional.silu(linear_46, inplace=False) + linear_46 = None + linear_47 = torch._C._nn.linear( + hidden_states_54, + l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_54 = l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_64 = silu_6 * linear_47 + silu_6 = linear_47 = None + down_proj_6 = torch._C._nn.linear( + mul_64, + l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_64 = l_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_55 = hidden_states_51 + down_proj_6 + hidden_states_51 = down_proj_6 = None + hidden_states_56 = hidden_states_55.to(torch.float32) + pow_15 = hidden_states_56.pow(2) + variance_14 = pow_15.mean(-1, keepdim=True) + pow_15 = None + add_42 = variance_14 + 1e-06 + variance_14 = None + rsqrt_14 = torch.rsqrt(add_42) + add_42 = None + hidden_states_57 = hidden_states_56 * rsqrt_14 + hidden_states_56 = rsqrt_14 = None + to_35 = hidden_states_57.to(torch.float32) + hidden_states_57 = None + hidden_states_58 = ( + l_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + * to_35 + ) + l_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = ( + to_35 + ) = None + linear_49 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_21 = linear_49.view((1, 19, -1, 64)) + linear_49 = None + query_states_7 = view_21.transpose(1, 2) + view_21 = None + linear_50 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_22 = linear_50.view((1, 19, -1, 64)) + linear_50 = None + key_states_7 = view_22.transpose(1, 2) + view_22 = None + linear_51 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_58 = l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_23 = linear_51.view((1, 19, -1, 64)) + linear_51 = None + value_states_7 = view_23.transpose(1, 2) + view_23 = None + cos_10 = cos_2.unsqueeze(1) + sin_10 = sin_2.unsqueeze(1) + mul_67 = query_states_7 * cos_10 + x1_14 = 
query_states_7[(Ellipsis, slice(None, 32, None))] + x2_14 = query_states_7[(Ellipsis, slice(32, None, None))] + query_states_7 = None + neg_14 = -x2_14 + x2_14 = None + cat_15 = torch.cat((neg_14, x1_14), dim=-1) + neg_14 = x1_14 = None + mul_68 = cat_15 * sin_10 + cat_15 = None + q_embed_7 = mul_67 + mul_68 + mul_67 = mul_68 = None + mul_69 = key_states_7 * cos_10 + cos_10 = None + x1_15 = key_states_7[(Ellipsis, slice(None, 32, None))] + x2_15 = key_states_7[(Ellipsis, slice(32, None, None))] + key_states_7 = None + neg_15 = -x2_15 + x2_15 = None + cat_16 = torch.cat((neg_15, x1_15), dim=-1) + neg_15 = x1_15 = None + mul_70 = cat_16 * sin_10 + cat_16 = sin_10 = None + k_embed_7 = mul_69 + mul_70 + mul_69 = mul_70 = None + attention_mask_8 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_28 = torch._C._nn.scaled_dot_product_attention( + q_embed_7, + k_embed_7, + value_states_7, + attn_mask=attention_mask_8, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_7 = k_embed_7 = value_states_7 = attention_mask_8 = None + transpose_32 = attn_output_28.transpose(1, 2) + attn_output_28 = None + attn_output_29 = transpose_32.contiguous() + transpose_32 = None + reshape_7 = attn_output_29.reshape(1, 19, -1) + attn_output_29 = None + attn_output_30 = reshape_7.contiguous() + reshape_7 = None + attn_output_31 = torch._C._nn.linear( + attn_output_30, + l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_30 = l_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_59 = hidden_states_55 + attn_output_31 + hidden_states_55 = attn_output_31 = None + hidden_states_60 = hidden_states_59.to(torch.float32) + pow_16 = hidden_states_60.pow(2) + variance_15 = pow_16.mean(-1, keepdim=True) + pow_16 = None + add_46 = variance_15 + 1e-06 + variance_15 = None + rsqrt_15 = torch.rsqrt(add_46) + add_46 = None + hidden_states_61 = hidden_states_60 * rsqrt_15 + hidden_states_60 = rsqrt_15 = None + to_37 = hidden_states_61.to(torch.float32) + hidden_states_61 = None + hidden_states_62 = ( + l_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + * to_37 + ) + l_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = ( + to_37 + ) = None + linear_53 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_7 = torch.nn.functional.silu(linear_53, inplace=False) + linear_53 = None + linear_54 = torch._C._nn.linear( + hidden_states_62, + l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_62 = l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_73 = silu_7 * linear_54 + silu_7 = linear_54 = None + down_proj_7 = torch._C._nn.linear( + mul_73, + l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_73 = l_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_63 = hidden_states_59 + down_proj_7 + hidden_states_59 = down_proj_7 = None + 
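+ # End of decoder layer 7: SwiGLU MLP (silu(gate_proj(x)) * up_proj(x), then
+ # down_proj) plus the residual add. The stream is cast to float32 below so
+ # the next RMSNorm's variance is computed at full precision.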
hidden_states_64 = hidden_states_63.to(torch.float32) + pow_17 = hidden_states_64.pow(2) + variance_16 = pow_17.mean(-1, keepdim=True) + pow_17 = None + add_48 = variance_16 + 1e-06 + variance_16 = None + rsqrt_16 = torch.rsqrt(add_48) + add_48 = None + hidden_states_65 = hidden_states_64 * rsqrt_16 + hidden_states_64 = rsqrt_16 = None + to_39 = hidden_states_65.to(torch.float32) + hidden_states_65 = None + hidden_states_66 = ( + l_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + * to_39 + ) + l_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = ( + to_39 + ) = None + linear_56 = torch._C._nn.linear( + hidden_states_66, + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_24 = linear_56.view((1, 19, -1, 64)) + linear_56 = None + query_states_8 = view_24.transpose(1, 2) + view_24 = None + linear_57 = torch._C._nn.linear( + hidden_states_66, + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_25 = linear_57.view((1, 19, -1, 64)) + linear_57 = None + key_states_8 = view_25.transpose(1, 2) + view_25 = None + linear_58 = torch._C._nn.linear( + hidden_states_66, + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_66 = l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_26 = linear_58.view((1, 19, -1, 64)) + linear_58 = None + value_states_8 = view_26.transpose(1, 2) + view_26 = None + cos_11 = cos_2.unsqueeze(1) + sin_11 = sin_2.unsqueeze(1) + mul_76 = query_states_8 * cos_11 + x1_16 = query_states_8[(Ellipsis, slice(None, 32, None))] + x2_16 = query_states_8[(Ellipsis, slice(32, None, None))] + query_states_8 = None + neg_16 = -x2_16 + x2_16 = None + cat_17 = torch.cat((neg_16, x1_16), dim=-1) + neg_16 = x1_16 = None + mul_77 = cat_17 * sin_11 + cat_17 = None + q_embed_8 = mul_76 + mul_77 + mul_76 = mul_77 = None + mul_78 = key_states_8 * cos_11 + cos_11 = None + x1_17 = key_states_8[(Ellipsis, slice(None, 32, None))] + x2_17 = key_states_8[(Ellipsis, slice(32, None, None))] + key_states_8 = None + neg_17 = -x2_17 + x2_17 = None + cat_18 = torch.cat((neg_17, x1_17), dim=-1) + neg_17 = x1_17 = None + mul_79 = cat_18 * sin_11 + cat_18 = sin_11 = None + k_embed_8 = mul_78 + mul_79 + mul_78 = mul_79 = None + attention_mask_9 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_32 = torch._C._nn.scaled_dot_product_attention( + q_embed_8, + k_embed_8, + value_states_8, + 
attn_mask=attention_mask_9, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_8 = k_embed_8 = value_states_8 = attention_mask_9 = None + transpose_36 = attn_output_32.transpose(1, 2) + attn_output_32 = None + attn_output_33 = transpose_36.contiguous() + transpose_36 = None + reshape_8 = attn_output_33.reshape(1, 19, -1) + attn_output_33 = None + attn_output_34 = reshape_8.contiguous() + reshape_8 = None + attn_output_35 = torch._C._nn.linear( + attn_output_34, + l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_34 = l_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_67 = hidden_states_63 + attn_output_35 + hidden_states_63 = attn_output_35 = None + hidden_states_68 = hidden_states_67.to(torch.float32) + pow_18 = hidden_states_68.pow(2) + variance_17 = pow_18.mean(-1, keepdim=True) + pow_18 = None + add_52 = variance_17 + 1e-06 + variance_17 = None + rsqrt_17 = torch.rsqrt(add_52) + add_52 = None + hidden_states_69 = hidden_states_68 * rsqrt_17 + hidden_states_68 = rsqrt_17 = None + to_41 = hidden_states_69.to(torch.float32) + hidden_states_69 = None + hidden_states_70 = ( + l_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + * to_41 + ) + l_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = ( + to_41 + ) = None + linear_60 = torch._C._nn.linear( + hidden_states_70, + l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_8 = torch.nn.functional.silu(linear_60, inplace=False) + linear_60 = None + linear_61 = torch._C._nn.linear( + hidden_states_70, + l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_70 = l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_82 = silu_8 * linear_61 + silu_8 = linear_61 = None + down_proj_8 = torch._C._nn.linear( + mul_82, + l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_82 = l_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_71 = hidden_states_67 + down_proj_8 + hidden_states_67 = down_proj_8 = None + hidden_states_72 = hidden_states_71.to(torch.float32) + pow_19 = hidden_states_72.pow(2) + variance_18 = pow_19.mean(-1, keepdim=True) + pow_19 = None + add_54 = variance_18 + 1e-06 + variance_18 = None + rsqrt_18 = torch.rsqrt(add_54) + add_54 = None + hidden_states_73 = hidden_states_72 * rsqrt_18 + hidden_states_72 = rsqrt_18 = None + to_43 = hidden_states_73.to(torch.float32) + hidden_states_73 = None + hidden_states_74 = ( + l_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + * to_43 + ) + l_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = ( + to_43 + ) = None + linear_63 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_, + ) + 
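+ # Each q/k/v projection output is viewed as (1, 19, -1, 64) and transposed
+ # to (batch, heads, seq, head_dim); head_dim = 64 is consistent with the
+ # SDPA scale of 0.125 = 1 / sqrt(64).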
l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_27 = linear_63.view((1, 19, -1, 64)) + linear_63 = None + query_states_9 = view_27.transpose(1, 2) + view_27 = None + linear_64 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_28 = linear_64.view((1, 19, -1, 64)) + linear_64 = None + key_states_9 = view_28.transpose(1, 2) + view_28 = None + linear_65 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_74 = l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_29 = linear_65.view((1, 19, -1, 64)) + linear_65 = None + value_states_9 = view_29.transpose(1, 2) + view_29 = None + cos_12 = cos_2.unsqueeze(1) + sin_12 = sin_2.unsqueeze(1) + mul_85 = query_states_9 * cos_12 + x1_18 = query_states_9[(Ellipsis, slice(None, 32, None))] + x2_18 = query_states_9[(Ellipsis, slice(32, None, None))] + query_states_9 = None + neg_18 = -x2_18 + x2_18 = None + cat_19 = torch.cat((neg_18, x1_18), dim=-1) + neg_18 = x1_18 = None + mul_86 = cat_19 * sin_12 + cat_19 = None + q_embed_9 = mul_85 + mul_86 + mul_85 = mul_86 = None + mul_87 = key_states_9 * cos_12 + cos_12 = None + x1_19 = key_states_9[(Ellipsis, slice(None, 32, None))] + x2_19 = key_states_9[(Ellipsis, slice(32, None, None))] + key_states_9 = None + neg_19 = -x2_19 + x2_19 = None + cat_20 = torch.cat((neg_19, x1_19), dim=-1) + neg_19 = x1_19 = None + mul_88 = cat_20 * sin_12 + cat_20 = sin_12 = None + k_embed_9 = mul_87 + mul_88 + mul_87 = mul_88 = None + attention_mask_10 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_36 = torch._C._nn.scaled_dot_product_attention( + q_embed_9, + k_embed_9, + value_states_9, + attn_mask=attention_mask_10, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_9 = k_embed_9 = value_states_9 = attention_mask_10 = None + transpose_40 = attn_output_36.transpose(1, 2) + attn_output_36 = None + attn_output_37 = transpose_40.contiguous() + transpose_40 = None + reshape_9 = attn_output_37.reshape(1, 19, -1) + attn_output_37 = None + attn_output_38 = reshape_9.contiguous() + reshape_9 = None + attn_output_39 = torch._C._nn.linear( + attn_output_38, + l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_38 = l_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_75 = hidden_states_71 + attn_output_39 + hidden_states_71 = attn_output_39 = None + hidden_states_76 = hidden_states_75.to(torch.float32) + pow_20 = hidden_states_76.pow(2) + variance_19 = 
pow_20.mean(-1, keepdim=True) + pow_20 = None + add_58 = variance_19 + 1e-06 + variance_19 = None + rsqrt_19 = torch.rsqrt(add_58) + add_58 = None + hidden_states_77 = hidden_states_76 * rsqrt_19 + hidden_states_76 = rsqrt_19 = None + to_45 = hidden_states_77.to(torch.float32) + hidden_states_77 = None + hidden_states_78 = ( + l_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + * to_45 + ) + l_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = ( + to_45 + ) = None + linear_67 = torch._C._nn.linear( + hidden_states_78, + l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_9 = torch.nn.functional.silu(linear_67, inplace=False) + linear_67 = None + linear_68 = torch._C._nn.linear( + hidden_states_78, + l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_78 = l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_91 = silu_9 * linear_68 + silu_9 = linear_68 = None + down_proj_9 = torch._C._nn.linear( + mul_91, + l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_91 = l_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_79 = hidden_states_75 + down_proj_9 + hidden_states_75 = down_proj_9 = None + hidden_states_80 = hidden_states_79.to(torch.float32) + pow_21 = hidden_states_80.pow(2) + variance_20 = pow_21.mean(-1, keepdim=True) + pow_21 = None + add_60 = variance_20 + 1e-06 + variance_20 = None + rsqrt_20 = torch.rsqrt(add_60) + add_60 = None + hidden_states_81 = hidden_states_80 * rsqrt_20 + hidden_states_80 = rsqrt_20 = None + to_47 = hidden_states_81.to(torch.float32) + hidden_states_81 = None + hidden_states_82 = ( + l_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + * to_47 + ) + l_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = ( + to_47 + ) = None + linear_70 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_30 = linear_70.view((1, 19, -1, 64)) + linear_70 = None + query_states_10 = view_30.transpose(1, 2) + view_30 = None + linear_71 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_31 = linear_71.view((1, 19, -1, 64)) + linear_71 = None + key_states_10 = view_31.transpose(1, 2) + view_31 = None + linear_72 = torch._C._nn.linear( + hidden_states_82, + 
l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_82 = l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_32 = linear_72.view((1, 19, -1, 64)) + linear_72 = None + value_states_10 = view_32.transpose(1, 2) + view_32 = None + cos_13 = cos_2.unsqueeze(1) + sin_13 = sin_2.unsqueeze(1) + mul_94 = query_states_10 * cos_13 + x1_20 = query_states_10[(Ellipsis, slice(None, 32, None))] + x2_20 = query_states_10[(Ellipsis, slice(32, None, None))] + query_states_10 = None + neg_20 = -x2_20 + x2_20 = None + cat_21 = torch.cat((neg_20, x1_20), dim=-1) + neg_20 = x1_20 = None + mul_95 = cat_21 * sin_13 + cat_21 = None + q_embed_10 = mul_94 + mul_95 + mul_94 = mul_95 = None + mul_96 = key_states_10 * cos_13 + cos_13 = None + x1_21 = key_states_10[(Ellipsis, slice(None, 32, None))] + x2_21 = key_states_10[(Ellipsis, slice(32, None, None))] + key_states_10 = None + neg_21 = -x2_21 + x2_21 = None + cat_22 = torch.cat((neg_21, x1_21), dim=-1) + neg_21 = x1_21 = None + mul_97 = cat_22 * sin_13 + cat_22 = sin_13 = None + k_embed_10 = mul_96 + mul_97 + mul_96 = mul_97 = None + attention_mask_11 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_40 = torch._C._nn.scaled_dot_product_attention( + q_embed_10, + k_embed_10, + value_states_10, + attn_mask=attention_mask_11, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_10 = k_embed_10 = value_states_10 = attention_mask_11 = None + transpose_44 = attn_output_40.transpose(1, 2) + attn_output_40 = None + attn_output_41 = transpose_44.contiguous() + transpose_44 = None + reshape_10 = attn_output_41.reshape(1, 19, -1) + attn_output_41 = None + attn_output_42 = reshape_10.contiguous() + reshape_10 = None + attn_output_43 = torch._C._nn.linear( + attn_output_42, + l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_42 = l_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_83 = hidden_states_79 + attn_output_43 + hidden_states_79 = attn_output_43 = None + hidden_states_84 = hidden_states_83.to(torch.float32) + pow_22 = hidden_states_84.pow(2) + variance_21 = pow_22.mean(-1, keepdim=True) + pow_22 = None + add_64 = variance_21 + 1e-06 + variance_21 = None + rsqrt_21 = torch.rsqrt(add_64) + add_64 = None + hidden_states_85 = hidden_states_84 * rsqrt_21 + hidden_states_84 = rsqrt_21 = None + to_49 = hidden_states_85.to(torch.float32) + hidden_states_85 = None + hidden_states_86 = ( + l_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + * to_49 + ) + l_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = ( + to_49 + ) = None + linear_74 = torch._C._nn.linear( + hidden_states_86, + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_10 = torch.nn.functional.silu(linear_74, inplace=False) + linear_74 = None + linear_75 = 
torch._C._nn.linear( + hidden_states_86, + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_86 = l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_100 = silu_10 * linear_75 + silu_10 = linear_75 = None + down_proj_10 = torch._C._nn.linear( + mul_100, + l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_100 = l_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_87 = hidden_states_83 + down_proj_10 + hidden_states_83 = down_proj_10 = None + hidden_states_88 = hidden_states_87.to(torch.float32) + pow_23 = hidden_states_88.pow(2) + variance_22 = pow_23.mean(-1, keepdim=True) + pow_23 = None + add_66 = variance_22 + 1e-06 + variance_22 = None + rsqrt_22 = torch.rsqrt(add_66) + add_66 = None + hidden_states_89 = hidden_states_88 * rsqrt_22 + hidden_states_88 = rsqrt_22 = None + to_51 = hidden_states_89.to(torch.float32) + hidden_states_89 = None + hidden_states_90 = ( + l_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + * to_51 + ) + l_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = ( + to_51 + ) = None + linear_77 = torch._C._nn.linear( + hidden_states_90, + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_33 = linear_77.view((1, 19, -1, 64)) + linear_77 = None + query_states_11 = view_33.transpose(1, 2) + view_33 = None + linear_78 = torch._C._nn.linear( + hidden_states_90, + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_34 = linear_78.view((1, 19, -1, 64)) + linear_78 = None + key_states_11 = view_34.transpose(1, 2) + view_34 = None + linear_79 = torch._C._nn.linear( + hidden_states_90, + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_90 = l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_35 = linear_79.view((1, 19, -1, 64)) + linear_79 = None + value_states_11 = view_35.transpose(1, 2) + view_35 = None + cos_14 = cos_2.unsqueeze(1) + sin_14 = sin_2.unsqueeze(1) + mul_103 = query_states_11 * cos_14 + x1_22 = query_states_11[(Ellipsis, slice(None, 32, None))] + x2_22 = query_states_11[(Ellipsis, slice(32, None, None))] + query_states_11 = None + neg_22 = -x2_22 + x2_22 = None + cat_23 = torch.cat((neg_22, x1_22), dim=-1) + neg_22 = x1_22 = None + mul_104 = cat_23 * sin_14 + cat_23 = None + 
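+ # Rotary position embedding: q_embed = q * cos + rotate_half(q) * sin, where
+ # rotate_half splits the 64-wide head dimension at 32 and maps (x1, x2) to
+ # (-x2, x1); that is exactly what the slice/neg/cat statements above compute.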
q_embed_11 = mul_103 + mul_104 + mul_103 = mul_104 = None + mul_105 = key_states_11 * cos_14 + cos_14 = None + x1_23 = key_states_11[(Ellipsis, slice(None, 32, None))] + x2_23 = key_states_11[(Ellipsis, slice(32, None, None))] + key_states_11 = None + neg_23 = -x2_23 + x2_23 = None + cat_24 = torch.cat((neg_23, x1_23), dim=-1) + neg_23 = x1_23 = None + mul_106 = cat_24 * sin_14 + cat_24 = sin_14 = None + k_embed_11 = mul_105 + mul_106 + mul_105 = mul_106 = None + attention_mask_12 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_44 = torch._C._nn.scaled_dot_product_attention( + q_embed_11, + k_embed_11, + value_states_11, + attn_mask=attention_mask_12, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_11 = k_embed_11 = value_states_11 = attention_mask_12 = None + transpose_48 = attn_output_44.transpose(1, 2) + attn_output_44 = None + attn_output_45 = transpose_48.contiguous() + transpose_48 = None + reshape_11 = attn_output_45.reshape(1, 19, -1) + attn_output_45 = None + attn_output_46 = reshape_11.contiguous() + reshape_11 = None + attn_output_47 = torch._C._nn.linear( + attn_output_46, + l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_46 = l_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_91 = hidden_states_87 + attn_output_47 + hidden_states_87 = attn_output_47 = None + hidden_states_92 = hidden_states_91.to(torch.float32) + pow_24 = hidden_states_92.pow(2) + variance_23 = pow_24.mean(-1, keepdim=True) + pow_24 = None + add_70 = variance_23 + 1e-06 + variance_23 = None + rsqrt_23 = torch.rsqrt(add_70) + add_70 = None + hidden_states_93 = hidden_states_92 * rsqrt_23 + hidden_states_92 = rsqrt_23 = None + to_53 = hidden_states_93.to(torch.float32) + hidden_states_93 = None + hidden_states_94 = ( + l_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + * to_53 + ) + l_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = ( + to_53 + ) = None + linear_81 = torch._C._nn.linear( + hidden_states_94, + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_11 = torch.nn.functional.silu(linear_81, inplace=False) + linear_81 = None + linear_82 = torch._C._nn.linear( + hidden_states_94, + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_94 = l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_109 = silu_11 * linear_82 + silu_11 = linear_82 = None + down_proj_11 = torch._C._nn.linear( + mul_109, + l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_109 = l_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_95 = hidden_states_91 + down_proj_11 + hidden_states_91 = down_proj_11 = None + hidden_states_96 = hidden_states_95.to(torch.float32) + pow_25 = hidden_states_96.pow(2) + variance_24 = pow_25.mean(-1, keepdim=True) + pow_25 = None + add_72 = variance_24 + 1e-06 + variance_24 = None + rsqrt_24 = torch.rsqrt(add_72) + 
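+ # The identical block now repeats for layer 12; Qwen1.5-0.5B stacks 24 such
+ # decoder layers (per the published model config; the total is not visible
+ # in this hunk).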
add_72 = None + hidden_states_97 = hidden_states_96 * rsqrt_24 + hidden_states_96 = rsqrt_24 = None + to_55 = hidden_states_97.to(torch.float32) + hidden_states_97 = None + hidden_states_98 = ( + l_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + * to_55 + ) + l_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = ( + to_55 + ) = None + linear_84 = torch._C._nn.linear( + hidden_states_98, + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_36 = linear_84.view((1, 19, -1, 64)) + linear_84 = None + query_states_12 = view_36.transpose(1, 2) + view_36 = None + linear_85 = torch._C._nn.linear( + hidden_states_98, + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_37 = linear_85.view((1, 19, -1, 64)) + linear_85 = None + key_states_12 = view_37.transpose(1, 2) + view_37 = None + linear_86 = torch._C._nn.linear( + hidden_states_98, + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_98 = l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_38 = linear_86.view((1, 19, -1, 64)) + linear_86 = None + value_states_12 = view_38.transpose(1, 2) + view_38 = None + cos_15 = cos_2.unsqueeze(1) + sin_15 = sin_2.unsqueeze(1) + mul_112 = query_states_12 * cos_15 + x1_24 = query_states_12[(Ellipsis, slice(None, 32, None))] + x2_24 = query_states_12[(Ellipsis, slice(32, None, None))] + query_states_12 = None + neg_24 = -x2_24 + x2_24 = None + cat_25 = torch.cat((neg_24, x1_24), dim=-1) + neg_24 = x1_24 = None + mul_113 = cat_25 * sin_15 + cat_25 = None + q_embed_12 = mul_112 + mul_113 + mul_112 = mul_113 = None + mul_114 = key_states_12 * cos_15 + cos_15 = None + x1_25 = key_states_12[(Ellipsis, slice(None, 32, None))] + x2_25 = key_states_12[(Ellipsis, slice(32, None, None))] + key_states_12 = None + neg_25 = -x2_25 + x2_25 = None + cat_26 = torch.cat((neg_25, x1_25), dim=-1) + neg_25 = x1_25 = None + mul_115 = cat_26 * sin_15 + cat_26 = sin_15 = None + k_embed_12 = mul_114 + mul_115 + mul_114 = mul_115 = None + attention_mask_13 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_48 = torch._C._nn.scaled_dot_product_attention( + q_embed_12, + k_embed_12, + value_states_12, + attn_mask=attention_mask_13, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_12 = k_embed_12 = value_states_12 = attention_mask_13 = None + transpose_52 = attn_output_48.transpose(1, 2) + 
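+ # Attention epilogue: transpose back to (1, 19, heads, 64), contiguous(),
+ # reshape(1, 19, -1) to re-merge the heads, then the bias-free o_proj.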
attn_output_48 = None + attn_output_49 = transpose_52.contiguous() + transpose_52 = None + reshape_12 = attn_output_49.reshape(1, 19, -1) + attn_output_49 = None + attn_output_50 = reshape_12.contiguous() + reshape_12 = None + attn_output_51 = torch._C._nn.linear( + attn_output_50, + l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_50 = l_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_99 = hidden_states_95 + attn_output_51 + hidden_states_95 = attn_output_51 = None + hidden_states_100 = hidden_states_99.to(torch.float32) + pow_26 = hidden_states_100.pow(2) + variance_25 = pow_26.mean(-1, keepdim=True) + pow_26 = None + add_76 = variance_25 + 1e-06 + variance_25 = None + rsqrt_25 = torch.rsqrt(add_76) + add_76 = None + hidden_states_101 = hidden_states_100 * rsqrt_25 + hidden_states_100 = rsqrt_25 = None + to_57 = hidden_states_101.to(torch.float32) + hidden_states_101 = None + hidden_states_102 = ( + l_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + * to_57 + ) + l_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = ( + to_57 + ) = None + linear_88 = torch._C._nn.linear( + hidden_states_102, + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_12 = torch.nn.functional.silu(linear_88, inplace=False) + linear_88 = None + linear_89 = torch._C._nn.linear( + hidden_states_102, + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_102 = l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_118 = silu_12 * linear_89 + silu_12 = linear_89 = None + down_proj_12 = torch._C._nn.linear( + mul_118, + l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_118 = l_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_103 = hidden_states_99 + down_proj_12 + hidden_states_99 = down_proj_12 = None + hidden_states_104 = hidden_states_103.to(torch.float32) + pow_27 = hidden_states_104.pow(2) + variance_26 = pow_27.mean(-1, keepdim=True) + pow_27 = None + add_78 = variance_26 + 1e-06 + variance_26 = None + rsqrt_26 = torch.rsqrt(add_78) + add_78 = None + hidden_states_105 = hidden_states_104 * rsqrt_26 + hidden_states_104 = rsqrt_26 = None + to_59 = hidden_states_105.to(torch.float32) + hidden_states_105 = None + hidden_states_106 = ( + l_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + * to_59 + ) + l_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = ( + to_59 + ) = None + linear_91 = torch._C._nn.linear( + hidden_states_106, + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_39 = linear_91.view((1, 
19, -1, 64)) + linear_91 = None + query_states_13 = view_39.transpose(1, 2) + view_39 = None + linear_92 = torch._C._nn.linear( + hidden_states_106, + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_40 = linear_92.view((1, 19, -1, 64)) + linear_92 = None + key_states_13 = view_40.transpose(1, 2) + view_40 = None + linear_93 = torch._C._nn.linear( + hidden_states_106, + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_106 = l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_41 = linear_93.view((1, 19, -1, 64)) + linear_93 = None + value_states_13 = view_41.transpose(1, 2) + view_41 = None + cos_16 = cos_2.unsqueeze(1) + sin_16 = sin_2.unsqueeze(1) + mul_121 = query_states_13 * cos_16 + x1_26 = query_states_13[(Ellipsis, slice(None, 32, None))] + x2_26 = query_states_13[(Ellipsis, slice(32, None, None))] + query_states_13 = None + neg_26 = -x2_26 + x2_26 = None + cat_27 = torch.cat((neg_26, x1_26), dim=-1) + neg_26 = x1_26 = None + mul_122 = cat_27 * sin_16 + cat_27 = None + q_embed_13 = mul_121 + mul_122 + mul_121 = mul_122 = None + mul_123 = key_states_13 * cos_16 + cos_16 = None + x1_27 = key_states_13[(Ellipsis, slice(None, 32, None))] + x2_27 = key_states_13[(Ellipsis, slice(32, None, None))] + key_states_13 = None + neg_27 = -x2_27 + x2_27 = None + cat_28 = torch.cat((neg_27, x1_27), dim=-1) + neg_27 = x1_27 = None + mul_124 = cat_28 * sin_16 + cat_28 = sin_16 = None + k_embed_13 = mul_123 + mul_124 + mul_123 = mul_124 = None + attention_mask_14 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_52 = torch._C._nn.scaled_dot_product_attention( + q_embed_13, + k_embed_13, + value_states_13, + attn_mask=attention_mask_14, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_13 = k_embed_13 = value_states_13 = attention_mask_14 = None + transpose_56 = attn_output_52.transpose(1, 2) + attn_output_52 = None + attn_output_53 = transpose_56.contiguous() + transpose_56 = None + reshape_13 = attn_output_53.reshape(1, 19, -1) + attn_output_53 = None + attn_output_54 = reshape_13.contiguous() + reshape_13 = None + attn_output_55 = torch._C._nn.linear( + attn_output_54, + l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_54 = l_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_107 = hidden_states_103 + attn_output_55 + hidden_states_103 = attn_output_55 = None + hidden_states_108 = hidden_states_107.to(torch.float32) + pow_28 = hidden_states_108.pow(2) + variance_27 = pow_28.mean(-1, keepdim=True) + pow_28 = None + add_82 = variance_27 + 1e-06 + variance_27 = None + rsqrt_27 = torch.rsqrt(add_82) + add_82 = None + hidden_states_109 = hidden_states_108 * 
rsqrt_27 + hidden_states_108 = rsqrt_27 = None + to_61 = hidden_states_109.to(torch.float32) + hidden_states_109 = None + hidden_states_110 = ( + l_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + * to_61 + ) + l_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = ( + to_61 + ) = None + linear_95 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_13 = torch.nn.functional.silu(linear_95, inplace=False) + linear_95 = None + linear_96 = torch._C._nn.linear( + hidden_states_110, + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_110 = l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_127 = silu_13 * linear_96 + silu_13 = linear_96 = None + down_proj_13 = torch._C._nn.linear( + mul_127, + l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_127 = l_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_111 = hidden_states_107 + down_proj_13 + hidden_states_107 = down_proj_13 = None + hidden_states_112 = hidden_states_111.to(torch.float32) + pow_29 = hidden_states_112.pow(2) + variance_28 = pow_29.mean(-1, keepdim=True) + pow_29 = None + add_84 = variance_28 + 1e-06 + variance_28 = None + rsqrt_28 = torch.rsqrt(add_84) + add_84 = None + hidden_states_113 = hidden_states_112 * rsqrt_28 + hidden_states_112 = rsqrt_28 = None + to_63 = hidden_states_113.to(torch.float32) + hidden_states_113 = None + hidden_states_114 = ( + l_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + * to_63 + ) + l_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = ( + to_63 + ) = None + linear_98 = torch._C._nn.linear( + hidden_states_114, + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_42 = linear_98.view((1, 19, -1, 64)) + linear_98 = None + query_states_14 = view_42.transpose(1, 2) + view_42 = None + linear_99 = torch._C._nn.linear( + hidden_states_114, + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_43 = linear_99.view((1, 19, -1, 64)) + linear_99 = None + key_states_14 = view_43.transpose(1, 2) + view_43 = None + linear_100 = torch._C._nn.linear( + hidden_states_114, + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_, + 
l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_114 = l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_44 = linear_100.view((1, 19, -1, 64)) + linear_100 = None + value_states_14 = view_44.transpose(1, 2) + view_44 = None + cos_17 = cos_2.unsqueeze(1) + sin_17 = sin_2.unsqueeze(1) + mul_130 = query_states_14 * cos_17 + x1_28 = query_states_14[(Ellipsis, slice(None, 32, None))] + x2_28 = query_states_14[(Ellipsis, slice(32, None, None))] + query_states_14 = None + neg_28 = -x2_28 + x2_28 = None + cat_29 = torch.cat((neg_28, x1_28), dim=-1) + neg_28 = x1_28 = None + mul_131 = cat_29 * sin_17 + cat_29 = None + q_embed_14 = mul_130 + mul_131 + mul_130 = mul_131 = None + mul_132 = key_states_14 * cos_17 + cos_17 = None + x1_29 = key_states_14[(Ellipsis, slice(None, 32, None))] + x2_29 = key_states_14[(Ellipsis, slice(32, None, None))] + key_states_14 = None + neg_29 = -x2_29 + x2_29 = None + cat_30 = torch.cat((neg_29, x1_29), dim=-1) + neg_29 = x1_29 = None + mul_133 = cat_30 * sin_17 + cat_30 = sin_17 = None + k_embed_14 = mul_132 + mul_133 + mul_132 = mul_133 = None + attention_mask_15 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_56 = torch._C._nn.scaled_dot_product_attention( + q_embed_14, + k_embed_14, + value_states_14, + attn_mask=attention_mask_15, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_14 = k_embed_14 = value_states_14 = attention_mask_15 = None + transpose_60 = attn_output_56.transpose(1, 2) + attn_output_56 = None + attn_output_57 = transpose_60.contiguous() + transpose_60 = None + reshape_14 = attn_output_57.reshape(1, 19, -1) + attn_output_57 = None + attn_output_58 = reshape_14.contiguous() + reshape_14 = None + attn_output_59 = torch._C._nn.linear( + attn_output_58, + l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_58 = l_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_115 = hidden_states_111 + attn_output_59 + hidden_states_111 = attn_output_59 = None + hidden_states_116 = hidden_states_115.to(torch.float32) + pow_30 = hidden_states_116.pow(2) + variance_29 = pow_30.mean(-1, keepdim=True) + pow_30 = None + add_88 = variance_29 + 1e-06 + variance_29 = None + rsqrt_29 = torch.rsqrt(add_88) + add_88 = None + hidden_states_117 = hidden_states_116 * rsqrt_29 + hidden_states_116 = rsqrt_29 = None + to_65 = hidden_states_117.to(torch.float32) + hidden_states_117 = None + hidden_states_118 = ( + l_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + * to_65 + ) + l_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = ( + to_65 + ) = None + linear_102 = torch._C._nn.linear( + hidden_states_118, + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_14 = torch.nn.functional.silu(linear_102, inplace=False) + linear_102 = None + linear_103 = torch._C._nn.linear( + hidden_states_118, + 
l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_118 = l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_136 = silu_14 * linear_103 + silu_14 = linear_103 = None + down_proj_14 = torch._C._nn.linear( + mul_136, + l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_136 = l_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_119 = hidden_states_115 + down_proj_14 + hidden_states_115 = down_proj_14 = None + hidden_states_120 = hidden_states_119.to(torch.float32) + pow_31 = hidden_states_120.pow(2) + variance_30 = pow_31.mean(-1, keepdim=True) + pow_31 = None + add_90 = variance_30 + 1e-06 + variance_30 = None + rsqrt_30 = torch.rsqrt(add_90) + add_90 = None + hidden_states_121 = hidden_states_120 * rsqrt_30 + hidden_states_120 = rsqrt_30 = None + to_67 = hidden_states_121.to(torch.float32) + hidden_states_121 = None + hidden_states_122 = ( + l_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + * to_67 + ) + l_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = ( + to_67 + ) = None + linear_105 = torch._C._nn.linear( + hidden_states_122, + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_45 = linear_105.view((1, 19, -1, 64)) + linear_105 = None + query_states_15 = view_45.transpose(1, 2) + view_45 = None + linear_106 = torch._C._nn.linear( + hidden_states_122, + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_46 = linear_106.view((1, 19, -1, 64)) + linear_106 = None + key_states_15 = view_46.transpose(1, 2) + view_46 = None + linear_107 = torch._C._nn.linear( + hidden_states_122, + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_122 = l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_47 = linear_107.view((1, 19, -1, 64)) + linear_107 = None + value_states_15 = view_47.transpose(1, 2) + view_47 = None + cos_18 = cos_2.unsqueeze(1) + sin_18 = sin_2.unsqueeze(1) + mul_139 = query_states_15 * cos_18 + x1_30 = query_states_15[(Ellipsis, slice(None, 32, None))] + x2_30 = query_states_15[(Ellipsis, slice(32, None, None))] + query_states_15 = None + neg_30 = -x2_30 + x2_30 = None + cat_31 = torch.cat((neg_30, x1_30), dim=-1) + neg_30 = x1_30 = None + mul_140 = cat_31 * sin_18 + cat_31 = None + q_embed_15 = 
mul_139 + mul_140 + mul_139 = mul_140 = None + mul_141 = key_states_15 * cos_18 + cos_18 = None + x1_31 = key_states_15[(Ellipsis, slice(None, 32, None))] + x2_31 = key_states_15[(Ellipsis, slice(32, None, None))] + key_states_15 = None + neg_31 = -x2_31 + x2_31 = None + cat_32 = torch.cat((neg_31, x1_31), dim=-1) + neg_31 = x1_31 = None + mul_142 = cat_32 * sin_18 + cat_32 = sin_18 = None + k_embed_15 = mul_141 + mul_142 + mul_141 = mul_142 = None + attention_mask_16 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_60 = torch._C._nn.scaled_dot_product_attention( + q_embed_15, + k_embed_15, + value_states_15, + attn_mask=attention_mask_16, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_15 = k_embed_15 = value_states_15 = attention_mask_16 = None + transpose_64 = attn_output_60.transpose(1, 2) + attn_output_60 = None + attn_output_61 = transpose_64.contiguous() + transpose_64 = None + reshape_15 = attn_output_61.reshape(1, 19, -1) + attn_output_61 = None + attn_output_62 = reshape_15.contiguous() + reshape_15 = None + attn_output_63 = torch._C._nn.linear( + attn_output_62, + l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_62 = l_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_123 = hidden_states_119 + attn_output_63 + hidden_states_119 = attn_output_63 = None + hidden_states_124 = hidden_states_123.to(torch.float32) + pow_32 = hidden_states_124.pow(2) + variance_31 = pow_32.mean(-1, keepdim=True) + pow_32 = None + add_94 = variance_31 + 1e-06 + variance_31 = None + rsqrt_31 = torch.rsqrt(add_94) + add_94 = None + hidden_states_125 = hidden_states_124 * rsqrt_31 + hidden_states_124 = rsqrt_31 = None + to_69 = hidden_states_125.to(torch.float32) + hidden_states_125 = None + hidden_states_126 = ( + l_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + * to_69 + ) + l_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = ( + to_69 + ) = None + linear_109 = torch._C._nn.linear( + hidden_states_126, + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_15 = torch.nn.functional.silu(linear_109, inplace=False) + linear_109 = None + linear_110 = torch._C._nn.linear( + hidden_states_126, + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_126 = l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_145 = silu_15 * linear_110 + silu_15 = linear_110 = None + down_proj_15 = torch._C._nn.linear( + mul_145, + l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_145 = l_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_127 = hidden_states_123 + down_proj_15 + hidden_states_123 = down_proj_15 = None + hidden_states_128 = hidden_states_127.to(torch.float32) + pow_33 = hidden_states_128.pow(2) + variance_32 = pow_33.mean(-1, keepdim=True) + pow_33 = None + add_96 = variance_32 + 1e-06 + variance_32 = None + rsqrt_32 = 
torch.rsqrt(add_96) + add_96 = None + hidden_states_129 = hidden_states_128 * rsqrt_32 + hidden_states_128 = rsqrt_32 = None + to_71 = hidden_states_129.to(torch.float32) + hidden_states_129 = None + hidden_states_130 = ( + l_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + * to_71 + ) + l_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = ( + to_71 + ) = None + linear_112 = torch._C._nn.linear( + hidden_states_130, + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_48 = linear_112.view((1, 19, -1, 64)) + linear_112 = None + query_states_16 = view_48.transpose(1, 2) + view_48 = None + linear_113 = torch._C._nn.linear( + hidden_states_130, + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_49 = linear_113.view((1, 19, -1, 64)) + linear_113 = None + key_states_16 = view_49.transpose(1, 2) + view_49 = None + linear_114 = torch._C._nn.linear( + hidden_states_130, + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_130 = l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_50 = linear_114.view((1, 19, -1, 64)) + linear_114 = None + value_states_16 = view_50.transpose(1, 2) + view_50 = None + cos_19 = cos_2.unsqueeze(1) + sin_19 = sin_2.unsqueeze(1) + mul_148 = query_states_16 * cos_19 + x1_32 = query_states_16[(Ellipsis, slice(None, 32, None))] + x2_32 = query_states_16[(Ellipsis, slice(32, None, None))] + query_states_16 = None + neg_32 = -x2_32 + x2_32 = None + cat_33 = torch.cat((neg_32, x1_32), dim=-1) + neg_32 = x1_32 = None + mul_149 = cat_33 * sin_19 + cat_33 = None + q_embed_16 = mul_148 + mul_149 + mul_148 = mul_149 = None + mul_150 = key_states_16 * cos_19 + cos_19 = None + x1_33 = key_states_16[(Ellipsis, slice(None, 32, None))] + x2_33 = key_states_16[(Ellipsis, slice(32, None, None))] + key_states_16 = None + neg_33 = -x2_33 + x2_33 = None + cat_34 = torch.cat((neg_33, x1_33), dim=-1) + neg_33 = x1_33 = None + mul_151 = cat_34 * sin_19 + cat_34 = sin_19 = None + k_embed_16 = mul_150 + mul_151 + mul_150 = mul_151 = None + attention_mask_17 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_64 = torch._C._nn.scaled_dot_product_attention( + q_embed_16, + k_embed_16, + value_states_16, + attn_mask=attention_mask_17, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_16 = k_embed_16 = value_states_16 = attention_mask_17 = None + transpose_68 = 
attn_output_64.transpose(1, 2) + attn_output_64 = None + attn_output_65 = transpose_68.contiguous() + transpose_68 = None + reshape_16 = attn_output_65.reshape(1, 19, -1) + attn_output_65 = None + attn_output_66 = reshape_16.contiguous() + reshape_16 = None + attn_output_67 = torch._C._nn.linear( + attn_output_66, + l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_66 = l_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_131 = hidden_states_127 + attn_output_67 + hidden_states_127 = attn_output_67 = None + hidden_states_132 = hidden_states_131.to(torch.float32) + pow_34 = hidden_states_132.pow(2) + variance_33 = pow_34.mean(-1, keepdim=True) + pow_34 = None + add_100 = variance_33 + 1e-06 + variance_33 = None + rsqrt_33 = torch.rsqrt(add_100) + add_100 = None + hidden_states_133 = hidden_states_132 * rsqrt_33 + hidden_states_132 = rsqrt_33 = None + to_73 = hidden_states_133.to(torch.float32) + hidden_states_133 = None + hidden_states_134 = ( + l_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + * to_73 + ) + l_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = ( + to_73 + ) = None + linear_116 = torch._C._nn.linear( + hidden_states_134, + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_16 = torch.nn.functional.silu(linear_116, inplace=False) + linear_116 = None + linear_117 = torch._C._nn.linear( + hidden_states_134, + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_134 = l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_154 = silu_16 * linear_117 + silu_16 = linear_117 = None + down_proj_16 = torch._C._nn.linear( + mul_154, + l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_154 = l_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_135 = hidden_states_131 + down_proj_16 + hidden_states_131 = down_proj_16 = None + hidden_states_136 = hidden_states_135.to(torch.float32) + pow_35 = hidden_states_136.pow(2) + variance_34 = pow_35.mean(-1, keepdim=True) + pow_35 = None + add_102 = variance_34 + 1e-06 + variance_34 = None + rsqrt_34 = torch.rsqrt(add_102) + add_102 = None + hidden_states_137 = hidden_states_136 * rsqrt_34 + hidden_states_136 = rsqrt_34 = None + to_75 = hidden_states_137.to(torch.float32) + hidden_states_137 = None + hidden_states_138 = ( + l_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + * to_75 + ) + l_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = ( + to_75 + ) = None + linear_119 = torch._C._nn.linear( + hidden_states_138, + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_ = 
l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_51 = linear_119.view((1, 19, -1, 64)) + linear_119 = None + query_states_17 = view_51.transpose(1, 2) + view_51 = None + linear_120 = torch._C._nn.linear( + hidden_states_138, + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_52 = linear_120.view((1, 19, -1, 64)) + linear_120 = None + key_states_17 = view_52.transpose(1, 2) + view_52 = None + linear_121 = torch._C._nn.linear( + hidden_states_138, + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_138 = l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_53 = linear_121.view((1, 19, -1, 64)) + linear_121 = None + value_states_17 = view_53.transpose(1, 2) + view_53 = None + cos_20 = cos_2.unsqueeze(1) + sin_20 = sin_2.unsqueeze(1) + mul_157 = query_states_17 * cos_20 + x1_34 = query_states_17[(Ellipsis, slice(None, 32, None))] + x2_34 = query_states_17[(Ellipsis, slice(32, None, None))] + query_states_17 = None + neg_34 = -x2_34 + x2_34 = None + cat_35 = torch.cat((neg_34, x1_34), dim=-1) + neg_34 = x1_34 = None + mul_158 = cat_35 * sin_20 + cat_35 = None + q_embed_17 = mul_157 + mul_158 + mul_157 = mul_158 = None + mul_159 = key_states_17 * cos_20 + cos_20 = None + x1_35 = key_states_17[(Ellipsis, slice(None, 32, None))] + x2_35 = key_states_17[(Ellipsis, slice(32, None, None))] + key_states_17 = None + neg_35 = -x2_35 + x2_35 = None + cat_36 = torch.cat((neg_35, x1_35), dim=-1) + neg_35 = x1_35 = None + mul_160 = cat_36 * sin_20 + cat_36 = sin_20 = None + k_embed_17 = mul_159 + mul_160 + mul_159 = mul_160 = None + attention_mask_18 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_68 = torch._C._nn.scaled_dot_product_attention( + q_embed_17, + k_embed_17, + value_states_17, + attn_mask=attention_mask_18, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_17 = k_embed_17 = value_states_17 = attention_mask_18 = None + transpose_72 = attn_output_68.transpose(1, 2) + attn_output_68 = None + attn_output_69 = transpose_72.contiguous() + transpose_72 = None + reshape_17 = attn_output_69.reshape(1, 19, -1) + attn_output_69 = None + attn_output_70 = reshape_17.contiguous() + reshape_17 = None + attn_output_71 = torch._C._nn.linear( + attn_output_70, + l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_70 = l_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_139 = hidden_states_135 + attn_output_71 + hidden_states_135 = attn_output_71 = None + hidden_states_140 = hidden_states_139.to(torch.float32) + pow_36 = hidden_states_140.pow(2) + variance_35 = pow_36.mean(-1, keepdim=True) + pow_36 = None 
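+ # RMSNorm core: variance = mean(x^2, dim=-1, keepdim=True), then
+ # x * rsqrt(variance + 1e-6), scaled elementwise by the layer's layernorm weight.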
+ add_106 = variance_35 + 1e-06 + variance_35 = None + rsqrt_35 = torch.rsqrt(add_106) + add_106 = None + hidden_states_141 = hidden_states_140 * rsqrt_35 + hidden_states_140 = rsqrt_35 = None + to_77 = hidden_states_141.to(torch.float32) + hidden_states_141 = None + hidden_states_142 = ( + l_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + * to_77 + ) + l_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = ( + to_77 + ) = None + linear_123 = torch._C._nn.linear( + hidden_states_142, + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_17 = torch.nn.functional.silu(linear_123, inplace=False) + linear_123 = None + linear_124 = torch._C._nn.linear( + hidden_states_142, + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_142 = l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_163 = silu_17 * linear_124 + silu_17 = linear_124 = None + down_proj_17 = torch._C._nn.linear( + mul_163, + l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_163 = l_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_143 = hidden_states_139 + down_proj_17 + hidden_states_139 = down_proj_17 = None + hidden_states_144 = hidden_states_143.to(torch.float32) + pow_37 = hidden_states_144.pow(2) + variance_36 = pow_37.mean(-1, keepdim=True) + pow_37 = None + add_108 = variance_36 + 1e-06 + variance_36 = None + rsqrt_36 = torch.rsqrt(add_108) + add_108 = None + hidden_states_145 = hidden_states_144 * rsqrt_36 + hidden_states_144 = rsqrt_36 = None + to_79 = hidden_states_145.to(torch.float32) + hidden_states_145 = None + hidden_states_146 = ( + l_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + * to_79 + ) + l_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = ( + to_79 + ) = None + linear_126 = torch._C._nn.linear( + hidden_states_146, + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_54 = linear_126.view((1, 19, -1, 64)) + linear_126 = None + query_states_18 = view_54.transpose(1, 2) + view_54 = None + linear_127 = torch._C._nn.linear( + hidden_states_146, + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_55 = linear_127.view((1, 19, -1, 64)) + linear_127 = None + key_states_18 = view_55.transpose(1, 2) + view_55 = None + linear_128 = torch._C._nn.linear( + hidden_states_146, + 
l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_146 = l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_56 = linear_128.view((1, 19, -1, 64)) + linear_128 = None + value_states_18 = view_56.transpose(1, 2) + view_56 = None + cos_21 = cos_2.unsqueeze(1) + sin_21 = sin_2.unsqueeze(1) + mul_166 = query_states_18 * cos_21 + x1_36 = query_states_18[(Ellipsis, slice(None, 32, None))] + x2_36 = query_states_18[(Ellipsis, slice(32, None, None))] + query_states_18 = None + neg_36 = -x2_36 + x2_36 = None + cat_37 = torch.cat((neg_36, x1_36), dim=-1) + neg_36 = x1_36 = None + mul_167 = cat_37 * sin_21 + cat_37 = None + q_embed_18 = mul_166 + mul_167 + mul_166 = mul_167 = None + mul_168 = key_states_18 * cos_21 + cos_21 = None + x1_37 = key_states_18[(Ellipsis, slice(None, 32, None))] + x2_37 = key_states_18[(Ellipsis, slice(32, None, None))] + key_states_18 = None + neg_37 = -x2_37 + x2_37 = None + cat_38 = torch.cat((neg_37, x1_37), dim=-1) + neg_37 = x1_37 = None + mul_169 = cat_38 * sin_21 + cat_38 = sin_21 = None + k_embed_18 = mul_168 + mul_169 + mul_168 = mul_169 = None + attention_mask_19 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_72 = torch._C._nn.scaled_dot_product_attention( + q_embed_18, + k_embed_18, + value_states_18, + attn_mask=attention_mask_19, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_18 = k_embed_18 = value_states_18 = attention_mask_19 = None + transpose_76 = attn_output_72.transpose(1, 2) + attn_output_72 = None + attn_output_73 = transpose_76.contiguous() + transpose_76 = None + reshape_18 = attn_output_73.reshape(1, 19, -1) + attn_output_73 = None + attn_output_74 = reshape_18.contiguous() + reshape_18 = None + attn_output_75 = torch._C._nn.linear( + attn_output_74, + l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_74 = l_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_147 = hidden_states_143 + attn_output_75 + hidden_states_143 = attn_output_75 = None + hidden_states_148 = hidden_states_147.to(torch.float32) + pow_38 = hidden_states_148.pow(2) + variance_37 = pow_38.mean(-1, keepdim=True) + pow_38 = None + add_112 = variance_37 + 1e-06 + variance_37 = None + rsqrt_37 = torch.rsqrt(add_112) + add_112 = None + hidden_states_149 = hidden_states_148 * rsqrt_37 + hidden_states_148 = rsqrt_37 = None + to_81 = hidden_states_149.to(torch.float32) + hidden_states_149 = None + hidden_states_150 = ( + l_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + * to_81 + ) + l_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = ( + to_81 + ) = None + linear_130 = torch._C._nn.linear( + hidden_states_150, + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_18 = torch.nn.functional.silu(linear_130, inplace=False) + linear_130 
= None + linear_131 = torch._C._nn.linear( + hidden_states_150, + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_150 = l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_172 = silu_18 * linear_131 + silu_18 = linear_131 = None + down_proj_18 = torch._C._nn.linear( + mul_172, + l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_172 = l_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_151 = hidden_states_147 + down_proj_18 + hidden_states_147 = down_proj_18 = None + hidden_states_152 = hidden_states_151.to(torch.float32) + pow_39 = hidden_states_152.pow(2) + variance_38 = pow_39.mean(-1, keepdim=True) + pow_39 = None + add_114 = variance_38 + 1e-06 + variance_38 = None + rsqrt_38 = torch.rsqrt(add_114) + add_114 = None + hidden_states_153 = hidden_states_152 * rsqrt_38 + hidden_states_152 = rsqrt_38 = None + to_83 = hidden_states_153.to(torch.float32) + hidden_states_153 = None + hidden_states_154 = ( + l_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + * to_83 + ) + l_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = ( + to_83 + ) = None + linear_133 = torch._C._nn.linear( + hidden_states_154, + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_57 = linear_133.view((1, 19, -1, 64)) + linear_133 = None + query_states_19 = view_57.transpose(1, 2) + view_57 = None + linear_134 = torch._C._nn.linear( + hidden_states_154, + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_58 = linear_134.view((1, 19, -1, 64)) + linear_134 = None + key_states_19 = view_58.transpose(1, 2) + view_58 = None + linear_135 = torch._C._nn.linear( + hidden_states_154, + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_154 = l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_59 = linear_135.view((1, 19, -1, 64)) + linear_135 = None + value_states_19 = view_59.transpose(1, 2) + view_59 = None + cos_22 = cos_2.unsqueeze(1) + sin_22 = sin_2.unsqueeze(1) + mul_175 = query_states_19 * cos_22 + x1_38 = query_states_19[(Ellipsis, slice(None, 32, None))] + x2_38 = query_states_19[(Ellipsis, slice(32, None, None))] + query_states_19 = None + neg_38 = -x2_38 + x2_38 = None + cat_39 = torch.cat((neg_38, x1_38), dim=-1) + neg_38 = x1_38 
= None + mul_176 = cat_39 * sin_22 + cat_39 = None + q_embed_19 = mul_175 + mul_176 + mul_175 = mul_176 = None + mul_177 = key_states_19 * cos_22 + cos_22 = None + x1_39 = key_states_19[(Ellipsis, slice(None, 32, None))] + x2_39 = key_states_19[(Ellipsis, slice(32, None, None))] + key_states_19 = None + neg_39 = -x2_39 + x2_39 = None + cat_40 = torch.cat((neg_39, x1_39), dim=-1) + neg_39 = x1_39 = None + mul_178 = cat_40 * sin_22 + cat_40 = sin_22 = None + k_embed_19 = mul_177 + mul_178 + mul_177 = mul_178 = None + attention_mask_20 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_76 = torch._C._nn.scaled_dot_product_attention( + q_embed_19, + k_embed_19, + value_states_19, + attn_mask=attention_mask_20, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_19 = k_embed_19 = value_states_19 = attention_mask_20 = None + transpose_80 = attn_output_76.transpose(1, 2) + attn_output_76 = None + attn_output_77 = transpose_80.contiguous() + transpose_80 = None + reshape_19 = attn_output_77.reshape(1, 19, -1) + attn_output_77 = None + attn_output_78 = reshape_19.contiguous() + reshape_19 = None + attn_output_79 = torch._C._nn.linear( + attn_output_78, + l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_78 = l_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_155 = hidden_states_151 + attn_output_79 + hidden_states_151 = attn_output_79 = None + hidden_states_156 = hidden_states_155.to(torch.float32) + pow_40 = hidden_states_156.pow(2) + variance_39 = pow_40.mean(-1, keepdim=True) + pow_40 = None + add_118 = variance_39 + 1e-06 + variance_39 = None + rsqrt_39 = torch.rsqrt(add_118) + add_118 = None + hidden_states_157 = hidden_states_156 * rsqrt_39 + hidden_states_156 = rsqrt_39 = None + to_85 = hidden_states_157.to(torch.float32) + hidden_states_157 = None + hidden_states_158 = ( + l_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + * to_85 + ) + l_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = ( + to_85 + ) = None + linear_137 = torch._C._nn.linear( + hidden_states_158, + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_19 = torch.nn.functional.silu(linear_137, inplace=False) + linear_137 = None + linear_138 = torch._C._nn.linear( + hidden_states_158, + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_158 = l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_181 = silu_19 * linear_138 + silu_19 = linear_138 = None + down_proj_19 = torch._C._nn.linear( + mul_181, + l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_181 = l_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_159 = hidden_states_155 + down_proj_19 + hidden_states_155 = down_proj_19 = None + hidden_states_160 = hidden_states_159.to(torch.float32) + pow_41 = hidden_states_160.pow(2) + variance_40 = pow_41.mean(-1, keepdim=True) + pow_41 = None + 
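+ # The paired .to(torch.float32) casts around each norm were likely captured from the
+ # model's RMSNorm upcast; with a float32 capture they are no-ops but are kept verbatim.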
add_120 = variance_40 + 1e-06 + variance_40 = None + rsqrt_40 = torch.rsqrt(add_120) + add_120 = None + hidden_states_161 = hidden_states_160 * rsqrt_40 + hidden_states_160 = rsqrt_40 = None + to_87 = hidden_states_161.to(torch.float32) + hidden_states_161 = None + hidden_states_162 = ( + l_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + * to_87 + ) + l_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = ( + to_87 + ) = None + linear_140 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_60 = linear_140.view((1, 19, -1, 64)) + linear_140 = None + query_states_20 = view_60.transpose(1, 2) + view_60 = None + linear_141 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_61 = linear_141.view((1, 19, -1, 64)) + linear_141 = None + key_states_20 = view_61.transpose(1, 2) + view_61 = None + linear_142 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_162 = l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_62 = linear_142.view((1, 19, -1, 64)) + linear_142 = None + value_states_20 = view_62.transpose(1, 2) + view_62 = None + cos_23 = cos_2.unsqueeze(1) + sin_23 = sin_2.unsqueeze(1) + mul_184 = query_states_20 * cos_23 + x1_40 = query_states_20[(Ellipsis, slice(None, 32, None))] + x2_40 = query_states_20[(Ellipsis, slice(32, None, None))] + query_states_20 = None + neg_40 = -x2_40 + x2_40 = None + cat_41 = torch.cat((neg_40, x1_40), dim=-1) + neg_40 = x1_40 = None + mul_185 = cat_41 * sin_23 + cat_41 = None + q_embed_20 = mul_184 + mul_185 + mul_184 = mul_185 = None + mul_186 = key_states_20 * cos_23 + cos_23 = None + x1_41 = key_states_20[(Ellipsis, slice(None, 32, None))] + x2_41 = key_states_20[(Ellipsis, slice(32, None, None))] + key_states_20 = None + neg_41 = -x2_41 + x2_41 = None + cat_42 = torch.cat((neg_41, x1_41), dim=-1) + neg_41 = x1_41 = None + mul_187 = cat_42 * sin_23 + cat_42 = sin_23 = None + k_embed_20 = mul_186 + mul_187 + mul_186 = mul_187 = None + attention_mask_21 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_80 = torch._C._nn.scaled_dot_product_attention( + q_embed_20, + k_embed_20, + value_states_20, + attn_mask=attention_mask_21, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_20 = k_embed_20 
= value_states_20 = attention_mask_21 = None + transpose_84 = attn_output_80.transpose(1, 2) + attn_output_80 = None + attn_output_81 = transpose_84.contiguous() + transpose_84 = None + reshape_20 = attn_output_81.reshape(1, 19, -1) + attn_output_81 = None + attn_output_82 = reshape_20.contiguous() + reshape_20 = None + attn_output_83 = torch._C._nn.linear( + attn_output_82, + l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_82 = l_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_163 = hidden_states_159 + attn_output_83 + hidden_states_159 = attn_output_83 = None + hidden_states_164 = hidden_states_163.to(torch.float32) + pow_42 = hidden_states_164.pow(2) + variance_41 = pow_42.mean(-1, keepdim=True) + pow_42 = None + add_124 = variance_41 + 1e-06 + variance_41 = None + rsqrt_41 = torch.rsqrt(add_124) + add_124 = None + hidden_states_165 = hidden_states_164 * rsqrt_41 + hidden_states_164 = rsqrt_41 = None + to_89 = hidden_states_165.to(torch.float32) + hidden_states_165 = None + hidden_states_166 = ( + l_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + * to_89 + ) + l_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = ( + to_89 + ) = None + linear_144 = torch._C._nn.linear( + hidden_states_166, + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_20 = torch.nn.functional.silu(linear_144, inplace=False) + linear_144 = None + linear_145 = torch._C._nn.linear( + hidden_states_166, + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_166 = l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_190 = silu_20 * linear_145 + silu_20 = linear_145 = None + down_proj_20 = torch._C._nn.linear( + mul_190, + l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_190 = l_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_167 = hidden_states_163 + down_proj_20 + hidden_states_163 = down_proj_20 = None + hidden_states_168 = hidden_states_167.to(torch.float32) + pow_43 = hidden_states_168.pow(2) + variance_42 = pow_43.mean(-1, keepdim=True) + pow_43 = None + add_126 = variance_42 + 1e-06 + variance_42 = None + rsqrt_42 = torch.rsqrt(add_126) + add_126 = None + hidden_states_169 = hidden_states_168 * rsqrt_42 + hidden_states_168 = rsqrt_42 = None + to_91 = hidden_states_169.to(torch.float32) + hidden_states_169 = None + hidden_states_170 = ( + l_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + * to_91 + ) + l_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = ( + to_91 + ) = None + linear_147 = torch._C._nn.linear( + hidden_states_170, + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_ = 
l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_63 = linear_147.view((1, 19, -1, 64)) + linear_147 = None + query_states_21 = view_63.transpose(1, 2) + view_63 = None + linear_148 = torch._C._nn.linear( + hidden_states_170, + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_64 = linear_148.view((1, 19, -1, 64)) + linear_148 = None + key_states_21 = view_64.transpose(1, 2) + view_64 = None + linear_149 = torch._C._nn.linear( + hidden_states_170, + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_, + ) + hidden_states_170 = l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_ = (None) + view_65 = linear_149.view((1, 19, -1, 64)) + linear_149 = None + value_states_21 = view_65.transpose(1, 2) + view_65 = None + cos_24 = cos_2.unsqueeze(1) + sin_24 = sin_2.unsqueeze(1) + mul_193 = query_states_21 * cos_24 + x1_42 = query_states_21[(Ellipsis, slice(None, 32, None))] + x2_42 = query_states_21[(Ellipsis, slice(32, None, None))] + query_states_21 = None + neg_42 = -x2_42 + x2_42 = None + cat_43 = torch.cat((neg_42, x1_42), dim=-1) + neg_42 = x1_42 = None + mul_194 = cat_43 * sin_24 + cat_43 = None + q_embed_21 = mul_193 + mul_194 + mul_193 = mul_194 = None + mul_195 = key_states_21 * cos_24 + cos_24 = None + x1_43 = key_states_21[(Ellipsis, slice(None, 32, None))] + x2_43 = key_states_21[(Ellipsis, slice(32, None, None))] + key_states_21 = None + neg_43 = -x2_43 + x2_43 = None + cat_44 = torch.cat((neg_43, x1_43), dim=-1) + neg_43 = x1_43 = None + mul_196 = cat_44 * sin_24 + cat_44 = sin_24 = None + k_embed_21 = mul_195 + mul_196 + mul_195 = mul_196 = None + attention_mask_22 = causal_mask[ + ( + slice(None, None, None), + slice(None, None, None), + slice(None, None, None), + slice(None, 19, None), + ) + ] + attn_output_84 = torch._C._nn.scaled_dot_product_attention( + q_embed_21, + k_embed_21, + value_states_21, + attn_mask=attention_mask_22, + dropout_p=0.0, + scale=0.125, + is_causal=False, + ) + q_embed_21 = k_embed_21 = value_states_21 = attention_mask_22 = None + transpose_88 = attn_output_84.transpose(1, 2) + attn_output_84 = None + attn_output_85 = transpose_88.contiguous() + transpose_88 = None + reshape_21 = attn_output_85.reshape(1, 19, -1) + attn_output_85 = None + attn_output_86 = reshape_21.contiguous() + reshape_21 = None + attn_output_87 = torch._C._nn.linear( + attn_output_86, + l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_, + None, + ) + attn_output_86 = l_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = (None) + hidden_states_171 = hidden_states_167 + attn_output_87 + hidden_states_167 = attn_output_87 = None + hidden_states_172 = hidden_states_171.to(torch.float32) + pow_44 = hidden_states_172.pow(2) + variance_43 = pow_44.mean(-1, keepdim=True) + pow_44 = None 
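+ # The chained "... = (None)" assignments are emitted by the graph capture to drop
+ # references to dead tensors early and reduce peak memory while the graph runs.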
+ add_130 = variance_43 + 1e-06 + variance_43 = None + rsqrt_43 = torch.rsqrt(add_130) + add_130 = None + hidden_states_173 = hidden_states_172 * rsqrt_43 + hidden_states_172 = rsqrt_43 = None + to_93 = hidden_states_173.to(torch.float32) + hidden_states_173 = None + hidden_states_174 = ( + l_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + * to_93 + ) + l_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = ( + to_93 + ) = None + linear_151 = torch._C._nn.linear( + hidden_states_174, + l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_, + None, + ) + l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_ = ( + None + ) + silu_21 = torch.nn.functional.silu(linear_151, inplace=False) + linear_151 = None + linear_152 = torch._C._nn.linear( + hidden_states_174, + l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_, + None, + ) + hidden_states_174 = l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_ = (None) + mul_199 = silu_21 * linear_152 + silu_21 = linear_152 = None + down_proj_21 = torch._C._nn.linear( + mul_199, + l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_, + None, + ) + mul_199 = l_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = (None) + hidden_states_175 = hidden_states_171 + down_proj_21 + hidden_states_171 = down_proj_21 = None + hidden_states_176 = hidden_states_175.to(torch.float32) + pow_45 = hidden_states_176.pow(2) + variance_44 = pow_45.mean(-1, keepdim=True) + pow_45 = None + add_132 = variance_44 + 1e-06 + variance_44 = None + rsqrt_44 = torch.rsqrt(add_132) + add_132 = None + hidden_states_177 = hidden_states_176 * rsqrt_44 + hidden_states_176 = rsqrt_44 = None + to_95 = hidden_states_177.to(torch.float32) + hidden_states_177 = None + hidden_states_178 = ( + l_self_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + * to_95 + ) + l_self_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = ( + to_95 + ) = None + linear_154 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_ = (None) + view_66 = linear_154.view((1, 19, -1, 64)) + linear_154 = None + query_states_22 = view_66.transpose(1, 2) + view_66 = None + linear_155 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_, + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_, + ) + l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_ = (None) + view_67 = linear_155.view((1, 19, -1, 64)) + linear_155 = None + key_states_22 = view_67.transpose(1, 2) + view_67 = None + linear_156 = torch._C._nn.linear( + hidden_states_178, + 
+        linear_156 = torch._C._nn.linear(
+            hidden_states_178,
+            l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        hidden_states_178 = l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_68 = linear_156.view((1, 19, -1, 64))
+        linear_156 = None
+        value_states_22 = view_68.transpose(1, 2)
+        view_68 = None
+        cos_25 = cos_2.unsqueeze(1)
+        sin_25 = sin_2.unsqueeze(1)
+        mul_202 = query_states_22 * cos_25
+        x1_44 = query_states_22[(Ellipsis, slice(None, 32, None))]
+        x2_44 = query_states_22[(Ellipsis, slice(32, None, None))]
+        query_states_22 = None
+        neg_44 = -x2_44
+        x2_44 = None
+        cat_45 = torch.cat((neg_44, x1_44), dim=-1)
+        neg_44 = x1_44 = None
+        mul_203 = cat_45 * sin_25
+        cat_45 = None
+        q_embed_22 = mul_202 + mul_203
+        mul_202 = mul_203 = None
+        mul_204 = key_states_22 * cos_25
+        cos_25 = None
+        x1_45 = key_states_22[(Ellipsis, slice(None, 32, None))]
+        x2_45 = key_states_22[(Ellipsis, slice(32, None, None))]
+        key_states_22 = None
+        neg_45 = -x2_45
+        x2_45 = None
+        cat_46 = torch.cat((neg_45, x1_45), dim=-1)
+        neg_45 = x1_45 = None
+        mul_205 = cat_46 * sin_25
+        cat_46 = sin_25 = None
+        k_embed_22 = mul_204 + mul_205
+        mul_204 = mul_205 = None
+        attention_mask_23 = causal_mask[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 19, None),
+            )
+        ]
+        attn_output_88 = torch._C._nn.scaled_dot_product_attention(
+            q_embed_22,
+            k_embed_22,
+            value_states_22,
+            attn_mask=attention_mask_23,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        q_embed_22 = k_embed_22 = value_states_22 = attention_mask_23 = None
+        transpose_92 = attn_output_88.transpose(1, 2)
+        attn_output_88 = None
+        attn_output_89 = transpose_92.contiguous()
+        transpose_92 = None
+        reshape_22 = attn_output_89.reshape(1, 19, -1)
+        attn_output_89 = None
+        attn_output_90 = reshape_22.contiguous()
+        reshape_22 = None
+        attn_output_91 = torch._C._nn.linear(
+            attn_output_90,
+            l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_90 = l_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_179 = hidden_states_175 + attn_output_91
+        hidden_states_175 = attn_output_91 = None
+        hidden_states_180 = hidden_states_179.to(torch.float32)
+        pow_46 = hidden_states_180.pow(2)
+        variance_45 = pow_46.mean(-1, keepdim=True)
+        pow_46 = None
+        add_136 = variance_45 + 1e-06
+        variance_45 = None
+        rsqrt_45 = torch.rsqrt(add_136)
+        add_136 = None
+        hidden_states_181 = hidden_states_180 * rsqrt_45
+        hidden_states_180 = rsqrt_45 = None
+        to_97 = hidden_states_181.to(torch.float32)
+        hidden_states_181 = None
+        hidden_states_182 = (
+            l_self_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_
+            * to_97
+        )
+        l_self_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = (
+            to_97
+        ) = None
+        linear_158 = torch._C._nn.linear(
+            hidden_states_182,
+            l_self_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_22 = torch.nn.functional.silu(linear_158, inplace=False)
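Attention is lowered straight to the fused kernel entry point torch._C._nn.scaled_dot_product_attention; scale=0.125 is 1/sqrt(head_dim) for the 64-wide heads, and the causal mask is materialized and sliced to the 19-token prompt rather than passed as is_causal=True. A functionally equivalent call through the public API (a sketch; q, k, v, and mask are placeholder names):

    import torch.nn.functional as F

    # q, k, v: (batch, heads, seq, head_dim) = (1, 16, 19, 64) in this graph
    attn = F.scaled_dot_product_attention(
        q, k, v,
        attn_mask=mask,   # additive causal mask sliced to [..., :19]
        dropout_p=0.0,
        scale=0.125,      # 1 / sqrt(64)
        is_causal=False,
    )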
+        linear_158 = None
+        linear_159 = torch._C._nn.linear(
+            hidden_states_182,
+            l_self_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_182 = l_self_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_208 = silu_22 * linear_159
+        silu_22 = linear_159 = None
+        down_proj_22 = torch._C._nn.linear(
+            mul_208,
+            l_self_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_208 = l_self_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_183 = hidden_states_179 + down_proj_22
+        hidden_states_179 = down_proj_22 = None
+        hidden_states_184 = hidden_states_183.to(torch.float32)
+        pow_47 = hidden_states_184.pow(2)
+        variance_46 = pow_47.mean(-1, keepdim=True)
+        pow_47 = None
+        add_138 = variance_46 + 1e-06
+        variance_46 = None
+        rsqrt_46 = torch.rsqrt(add_138)
+        add_138 = None
+        hidden_states_185 = hidden_states_184 * rsqrt_46
+        hidden_states_184 = rsqrt_46 = None
+        to_99 = hidden_states_185.to(torch.float32)
+        hidden_states_185 = None
+        hidden_states_186 = (
+            l_self_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_
+            * to_99
+        )
+        l_self_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = (
+            to_99
+        ) = None
+        linear_161 = torch._C._nn.linear(
+            hidden_states_186,
+            l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_,
+            l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_,
+        )
+        l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_ = (None)
+        view_69 = linear_161.view((1, 19, -1, 64))
+        linear_161 = None
+        query_states_23 = view_69.transpose(1, 2)
+        view_69 = None
+        linear_162 = torch._C._nn.linear(
+            hidden_states_186,
+            l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_,
+            l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_,
+        )
+        l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_ = (None)
+        view_70 = linear_162.view((1, 19, -1, 64))
+        linear_162 = None
+        key_states_23 = view_70.transpose(1, 2)
+        view_70 = None
+        linear_163 = torch._C._nn.linear(
+            hidden_states_186,
+            l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_,
+            l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_,
+        )
+        hidden_states_186 = l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_ = l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_ = (None)
+        view_71 = linear_163.view((1, 19, -1, 64))
+        linear_163 = None
+        value_states_23 = view_71.transpose(1, 2)
+        view_71 = None
+        cos_26 = cos_2.unsqueeze(1)
+        cos_2 = None
+        sin_26 = sin_2.unsqueeze(1)
+        sin_2 = None
+        mul_211 = query_states_23 * cos_26
+        x1_46 = query_states_23[(Ellipsis, slice(None, 32, None))]
+        x2_46 = query_states_23[(Ellipsis, slice(32, None, None))]
+        query_states_23 = None
+        neg_46 = -x2_46
+        x2_46 = None
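Each MLP block in the graph is the SwiGLU pattern used by Qwen models: silu(gate_proj(x)) * up_proj(x), projected back down. A self-contained sketch with the 1024/2816 sizes recorded in weight_meta.py (a hypothetical module, not code from the generated file):

    import torch
    import torch.nn.functional as F

    class SwiGLUMLP(torch.nn.Module):
        def __init__(self, hidden=1024, intermediate=2816):
            super().__init__()
            self.gate_proj = torch.nn.Linear(hidden, intermediate, bias=False)
            self.up_proj = torch.nn.Linear(hidden, intermediate, bias=False)
            self.down_proj = torch.nn.Linear(intermediate, hidden, bias=False)

        def forward(self, x):
            # silu-gated elementwise product, as in the silu_*/mul_* nodes above
            return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))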
+        cat_47 = torch.cat((neg_46, x1_46), dim=-1)
+        neg_46 = x1_46 = None
+        mul_212 = cat_47 * sin_26
+        cat_47 = None
+        q_embed_23 = mul_211 + mul_212
+        mul_211 = mul_212 = None
+        mul_213 = key_states_23 * cos_26
+        cos_26 = None
+        x1_47 = key_states_23[(Ellipsis, slice(None, 32, None))]
+        x2_47 = key_states_23[(Ellipsis, slice(32, None, None))]
+        key_states_23 = None
+        neg_47 = -x2_47
+        x2_47 = None
+        cat_48 = torch.cat((neg_47, x1_47), dim=-1)
+        neg_47 = x1_47 = None
+        mul_214 = cat_48 * sin_26
+        cat_48 = sin_26 = None
+        k_embed_23 = mul_213 + mul_214
+        mul_213 = mul_214 = None
+        attention_mask_24 = causal_mask[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 19, None),
+            )
+        ]
+        causal_mask = None
+        attn_output_92 = torch._C._nn.scaled_dot_product_attention(
+            q_embed_23,
+            k_embed_23,
+            value_states_23,
+            attn_mask=attention_mask_24,
+            dropout_p=0.0,
+            scale=0.125,
+            is_causal=False,
+        )
+        q_embed_23 = k_embed_23 = value_states_23 = attention_mask_24 = None
+        transpose_96 = attn_output_92.transpose(1, 2)
+        attn_output_92 = None
+        attn_output_93 = transpose_96.contiguous()
+        transpose_96 = None
+        reshape_23 = attn_output_93.reshape(1, 19, -1)
+        attn_output_93 = None
+        attn_output_94 = reshape_23.contiguous()
+        reshape_23 = None
+        attn_output_95 = torch._C._nn.linear(
+            attn_output_94,
+            l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_,
+            None,
+        )
+        attn_output_94 = l_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = (None)
+        hidden_states_187 = hidden_states_183 + attn_output_95
+        hidden_states_183 = attn_output_95 = None
+        hidden_states_188 = hidden_states_187.to(torch.float32)
+        pow_48 = hidden_states_188.pow(2)
+        variance_47 = pow_48.mean(-1, keepdim=True)
+        pow_48 = None
+        add_142 = variance_47 + 1e-06
+        variance_47 = None
+        rsqrt_47 = torch.rsqrt(add_142)
+        add_142 = None
+        hidden_states_189 = hidden_states_188 * rsqrt_47
+        hidden_states_188 = rsqrt_47 = None
+        to_101 = hidden_states_189.to(torch.float32)
+        hidden_states_189 = None
+        hidden_states_190 = (
+            l_self_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_
+            * to_101
+        )
+        l_self_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = (
+            to_101
+        ) = None
+        linear_165 = torch._C._nn.linear(
+            hidden_states_190,
+            l_self_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_,
+            None,
+        )
+        l_self_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_ = (
+            None
+        )
+        silu_23 = torch.nn.functional.silu(linear_165, inplace=False)
+        linear_165 = None
+        linear_166 = torch._C._nn.linear(
+            hidden_states_190,
+            l_self_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_,
+            None,
+        )
+        hidden_states_190 = l_self_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_ = (None)
+        mul_217 = silu_23 * linear_166
+        silu_23 = linear_166 = None
+        down_proj_23 = torch._C._nn.linear(
+            mul_217,
+            l_self_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_,
+            None,
+        )
+        mul_217 = l_self_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = (None)
+        hidden_states_191 = hidden_states_187 + down_proj_23
+        hidden_states_187 = down_proj_23 = None
+        hidden_states_192 = hidden_states_191.to(torch.float32)
+        hidden_states_191 = None
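Layer 23 above is the last of Qwen1.5-0.5B's 24 identical decoder layers; the captured graph is this block unrolled end to end. In outline (a dataflow sketch, not code from the file):

    def decoder_layer(h, layer):
        # pre-norm self-attention with RoPE, then residual add
        h = h + layer.self_attn(layer.input_layernorm(h))
        # pre-norm SwiGLU MLP, then residual add
        h = h + layer.mlp(layer.post_attention_layernorm(h))
        return h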
+        pow_49 = hidden_states_192.pow(2)
+        variance_48 = pow_49.mean(-1, keepdim=True)
+        pow_49 = None
+        add_144 = variance_48 + 1e-06
+        variance_48 = None
+        rsqrt_48 = torch.rsqrt(add_144)
+        add_144 = None
+        hidden_states_193 = hidden_states_192 * rsqrt_48
+        hidden_states_192 = rsqrt_48 = None
+        to_103 = hidden_states_193.to(torch.float32)
+        hidden_states_193 = None
+        hidden_states_194 = (
+            l_self_modules_model_modules_norm_parameters_weight_ * to_103
+        )
+        l_self_modules_model_modules_norm_parameters_weight_ = to_103 = None
+        getitem_122 = hidden_states_194[
+            (slice(None, None, None), slice(0, None, None), slice(None, None, None))
+        ]
+        hidden_states_194 = None
+        logits = torch._C._nn.linear(
+            getitem_122,
+            l_self_modules_model_modules_embed_tokens_parameters_weight_,
+            None,
+        )
+        getitem_122 = (
+            l_self_modules_model_modules_embed_tokens_parameters_weight_
+        ) = None
+        return (logits,)
diff --git a/samples/transformers-auto-model/Qwen1.5-0.5B/weight_meta.py b/samples/transformers-auto-model/Qwen1.5-0.5B/weight_meta.py
new file mode 100644
index 000000000..2e5ab88e5
--- /dev/null
+++ b/samples/transformers-auto-model/Qwen1.5-0.5B/weight_meta.py
@@ -0,0 +1,2981 @@
+class Program_weight_tensor_meta_L_kwargs_input_ids_:
+    name = "L_kwargs_input_ids_"
+    shape = [1, 19]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [
+        9707,
+        11,
+        847,
+        829,
+        374,
+        14261,
+        13,
+        358,
+        1079,
+        6832,
+        911,
+        3460,
+        4128,
+        4119,
+        323,
+        862,
+        77235,
+        13,
+        220,
+    ]
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_embed_tokens_parameters_weight_:
+    name = "L_self_modules_model_modules_embed_tokens_parameters_weight_"
+    shape = [151936, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.018
+    data = None
+
+
+class Program_weight_tensor_meta_L_kwargs_attention_mask_:
+    name = "L_kwargs_attention_mask_"
+    shape = [1, 19]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_rotary_emb_buffers_inv_freq_:
+    name = "L_self_modules_model_modules_rotary_emb_buffers_inv_freq_"
+    shape = [32]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.089
+    std = 0.218
+    data = [
+        1.000000,
+        0.649382,
+        0.421697,
+        0.273842,
+        0.177828,
+        0.115478,
+        0.074989,
+        0.048697,
+        0.031623,
+        0.020535,
+        0.013335,
+        0.008660,
+        0.005623,
+        0.003652,
+        0.002371,
+        0.001540,
+        0.001000,
+        0.000649,
+        0.000422,
+        0.000274,
+        0.000178,
+        0.000115,
+        0.000075,
+        0.000049,
+        0.000032,
+        0.000021,
+        0.000013,
+        0.000009,
+        0.000006,
+        0.000004,
+        0.000002,
+        0.000002,
+    ]
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_:
+    name = "L_self_modules_model_modules_layers_modules_0_modules_input_layernorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.049
+    std = 0.038
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_:
+    name = "L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.079
+    data = None
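Two things are worth noting at this point. First, `logits` is produced by reusing the embedding matrix as the output projection, i.e. Qwen1.5-0.5B ties its input and output embeddings. Second, weight_meta.py stores only summary statistics for large tensors, while small integer tensors keep their raw data; that is enough to rebuild stand-in tensors. A hedged sketch of such a consumer (the materialize helper below is hypothetical, not part of this repo):

    import torch

    def materialize(meta):
        dtype = getattr(torch, meta.dtype.split(".")[-1])
        if meta.data is not None:
            # small tensors (e.g. input_ids, attention_mask) are stored verbatim
            return torch.tensor(meta.data, dtype=dtype).reshape(meta.shape)
        # large weights: noise matching the recorded first two moments
        return (torch.randn(meta.shape) * meta.std + meta.mean).to(dtype)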
"L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.105 + std = 4.635 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.075 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.085 + std = 7.672 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.016 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.001 + std = 0.017 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.015 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.429 + std = 0.088 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_1_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.616 + std = 0.163 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.040 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.001 + std = 1.033 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.042 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.020 + std = 3.080 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.015 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.003 + std = 0.075 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.014 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.553 + std = 0.137 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.028 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.022 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.022 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.849 + std = 0.123 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.035 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.045 + std = 1.223 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.036 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.030 + std = 2.979 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.015 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.055 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.014 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.578 + std = 0.125 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.882 + std = 0.139 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.033 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.024 + std = 1.150 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.035 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.136 + std = 2.621 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.016 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.056 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.015 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.611 + std = 0.104 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.767 + std = 0.110 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.035 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.010 + std = 1.133 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.037 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.005 + std = 3.850 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.016 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.002 + std = 0.054 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.016 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.681 + std = 0.130 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.866 + std = 0.112 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.034 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.003 + std = 1.117 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.034 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.008 + std = 2.713 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.017 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.002 + std = 0.083 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.016 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.736 + std = 0.234 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_6_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.908 + std = 0.142 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.032 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.034 + std = 1.283 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.031 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.023 + std = 1.433 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.003 + std = 0.109 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.019 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.735 + std = 0.161 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.039 + std = 0.199 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.030 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.051 + std = 1.293 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.030 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.014 + std = 0.800 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.019 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.002 + std = 0.066 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.018 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.753 + std = 0.178 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.913 + std = 0.200 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.032 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.113 + std = 1.260 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.032 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.137 + std = 2.284 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.002 + std = 0.063 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.019 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.754 + std = 0.190 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.890 + std = 0.260 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.033 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.030 + std = 1.269 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.033 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.022 + std = 1.331 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.001 + std = 0.123 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.722 + std = 0.206 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.132 + std = 0.354 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.028 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.048 + std = 1.085 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.028 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.023 + std = 0.679 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.019 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.001 + std = 0.091 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.018 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.704 + std = 0.201 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_11_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.100 + std = 0.347 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.029 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.016 + std = 1.116 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.028 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.036 + std = 1.326 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.084 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.709 + std = 0.175 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.078 + std = 0.333 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.030 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.002 + std = 1.149 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.030 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.013 + std = 0.783 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.117 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.019 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.725 + std = 0.193 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.054 + std = 0.323 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.031 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.034 + std = 1.212 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.031 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.010 + std = 0.865 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.093 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.019 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.745 + std = 0.189 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.100 + std = 0.314 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.029 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.039 + std = 1.225 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.029 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.743 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.021 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.069 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.751 + std = 0.141 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.108 + std = 0.270 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.029 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.041 + std = 1.242 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.028 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.047 + std = 0.761 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.067 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.022 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.784 + std = 0.113 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_16_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.155 + std = 0.232 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.028 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.033 + std = 1.178 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.045 + std = 0.730 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.003 + std = 0.163 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.854 + std = 0.101 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.173 + std = 0.139 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.029 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 1.199 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.011 + std = 0.790 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.002 + std = 0.160 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.917 + std = 0.082 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.206 + std = 0.195 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.028 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.001 + std = 1.163 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.031 + std = 0.701 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.124 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.999 + std = 0.069 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.257 + std = 0.206 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.029 + std = 1.134 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.108 + std = 2.506 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.004 + std = 0.207 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.062 + std = 0.064 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.350 + std = 0.350 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.007 + std = 1.085 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.021 + std = 0.732 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.002 + std = 0.154 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.111 + std = 0.067 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_21_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.391 + std = 0.254 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.030 + std = 1.161 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.024 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.002 + std = 0.830 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.030 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.002 + std = 0.217 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.028 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.178 + std = 0.082 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_22_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.553 + std = 0.210 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.035 + std = 1.167 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.022 + std = 4.438 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.030 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.023 + std = 0.453 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: + name = 
"L_self_modules_model_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.294 + std = 0.251 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.027 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_23_modules_input_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.543 + std = 0.221 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_q_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.059 + std = 0.961 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_: + name = "L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_k_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.019 + std = 1.666 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.028 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_: + name = 
"L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_v_proj_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.019 + std = 0.346 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.467 + std = 0.267 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_gate_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.025 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_up_proj_parameters_weight_" + shape = [2816, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.026 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: + name = "L_self_modules_model_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_" + shape = [1024, 2816] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.023 + data = None + + +class Program_weight_tensor_meta_L_self_modules_model_modules_norm_parameters_weight_: + name = "L_self_modules_model_modules_norm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 5.330 + std = 0.430 + data = None diff --git a/tools/ci/check_validate.sh b/tools/ci/check_validate.sh index d32e4f636..f16740916 100644 --- a/tools/ci/check_validate.sh +++ b/tools/ci/check_validate.sh @@ -38,8 +38,8 @@ function prepare_torch_env() { env http_proxy="" https_proxy="" pip install -U pip > /dev/null [ $? -ne 0 ] && LOG "[FATAL] Update pip failed!" && exit -1 # install torch - pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu118 > /dev/null - [ $? -ne 0 ] && LOG "[FATAL] Install torch2.6.0 failed!" && exit -1 + pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 > /dev/null + [ $? -ne 0 ] && LOG "[FATAL] Install torch2.9.0 failed!" && exit -1 else python ${GRAPH_NET_EXTRACT_WORKSPACE}/tools/count_sample.py LOG "[INFO] This pull request doesn't change any torch samples, skip the CI."