// RUN: mlir-opt %s | mlir-opt -canonicalize -cse | FileCheck %s

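// This test round-trips a predicated nvgpu.tma.async.store through
// -canonicalize and -cse and checks that the TMA store, its shared-memory
// view, and the thread-0 predicate are still present afterwards.
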
gpu.module @main_kernel {

  // CHECK-LABEL: @main_kernel(
  // CHECK-SAME: %[[arg0:.*]]: !nvgpu.tensormap.descriptor
  gpu.func @main_kernel(%arg0: !nvgpu.tensormap.descriptor<
      tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none,
      oob = zero, interleave = none>) kernel attributes
      {gpu.known_block_size = array<i32: 128, 1, 1>,
       gpu.known_grid_size = array<i32: 1, 1, 1>}
  {
    // CHECK: %[[c0:.+]] = arith.constant 0 : index
    // CHECK: %[[S0:.+]] = gpu.thread_id x
    // CHECK: %[[S1:.+]] = arith.cmpi eq, %[[S0]], %[[c0]] : index
    // CHECK: %[[S2:.+]] = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
    // CHECK: %[[S3:.+]] = memref.view %[[S2]][%[[c0]]][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x32xf32, #gpu.address_space<workgroup>>
    // CHECK: nvgpu.tma.async.store %[[S3]] to %[[arg0]][%[[c0]], %[[c0]]], predicate = %[[S1]] : memref<128x32xf32, #gpu.address_space<workgroup>> -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
    %c0 = arith.constant 0 : index
    %0 = gpu.thread_id x
    %1 = arith.cmpi eq, %0, %c0 : index
    %2 = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
    %view = memref.view %2[%c0][] : memref<?xi8, #gpu.address_space<workgroup>> to memref<128x32xf32, #gpu.address_space<workgroup>>
    nvgpu.tma.async.store %view to %arg0[%c0, %c0], predicate = %1 : memref<128x32xf32, #gpu.address_space<workgroup>> -> <tensor = memref<128x32xf32, 3>, swizzle = none, l2promo = none, oob = zero, interleave = none>
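    // Commit the outstanding bulk async copy (the TMA store above) into a
    // group and wait until that group has completed before returning.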
    nvvm.cp.async.bulk.commit.group
    nvvm.cp.async.bulk.wait_group 0
    gpu.return
  }
}