Fix regression issue in flex decoding. (#3999)

chengjunlu · web-flow · commit 935ded30407b · 2025-04-25T13:06:41.000+08:00
The `tt.store` operation with a BlockPointer falls back to a scatter store
if the block shape or the value layout is not supported by 2D block IO.
In that case the lowering code transforms the BlockPointer into
per-element pointers and masks.

The scatter store should still `and` the thread predicate with the
`maskElems` even if the `llMask` doesn't exist.

Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com>
diff --git a/test/TritonIntelGPU/blockptr_store.mlir b/test/TritonIntelGPU/blockptr_store.mlir
@@ -364,20 +364,31 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
       %c1_i64 = arith.constant 1 : i64
       %c0_i32 = arith.constant 0 : i32
       %0 = tt.make_tensor_ptr %arg0, [%c64_i64, %c64_i64], [%c1_i64, %col_stride], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x16xf16, #blocked>>
+      // CHECK: llvm.call spir_funccc @_Z12get_local_idj
       // CHECK-NOT: llvm.icmp "slt"
-      // CHECK-COUNT-32: llvm.store
+      // CHECK: %[[threadID:.*]] = llvm.call spir_funccc @_Z12get_local_idj
+      // CHECK: %[[VAL_583:.*]] = llvm.trunc %[[threadID]] : i64 to i32
+      // CHECK: %[[VAL_584:.*]] = llvm.mlir.constant(16 : i32) : i32
+      // CHECK: %[[VAL_586:.*]] = llvm.udiv %[[VAL_583]], %[[VAL_584]] : i32
+      // CHECK: %[[VAL_587:.*]] = llvm.mlir.constant(3 : i32) : i32
+      // CHECK: %[[VAL_588:.*]] = llvm.and %[[VAL_586]], %[[VAL_587]] : i32
+      // CHECK: %[[threadPred:.*]] = llvm.icmp "eq" %[[VAL_588]], {{.*}} : i32
+      // CHECK-COUNT-32: llvm.cond_br %[[threadPred]]
       tt.store %0, %cst : !tt.ptr<tensor<64x16xf16, #blocked>>
 
       // CHECK-COUNT-16: llvm.icmp "slt"
-      // CHECK-COUNT-32: llvm.store
+      // CHECK: %[[threadPred_0:.*]] = llvm.icmp "eq"
+      // CHECK-COUNT-32: llvm.and %[[threadPred_0]], {{.*}} : i1
       tt.store %0, %cst {boundaryCheck = array<i32: 0>} : !tt.ptr<tensor<64x16xf16, #blocked>>
 
       // CHECK-COUNT-16: llvm.icmp "slt"
-      // CHECK-COUNT-32: llvm.store
+      // CHECK: %[[threadPred_1:.*]] = llvm.icmp "eq"
+      // CHECK-COUNT-32: llvm.and %[[threadPred_1]], {{.*}} : i1
       tt.store %0, %cst {boundaryCheck = array<i32: 1>} : !tt.ptr<tensor<64x16xf16, #blocked>>
 
       // CHECK-COUNT-32: llvm.icmp "slt"
-      // CHECK-COUNT-32: llvm.store
+      // CHECK: %[[threadPred_2:.*]] = llvm.icmp "eq"
+      // CHECK-COUNT-32: llvm.and %[[threadPred_2]], {{.*}} : i1
       tt.store %0, %cst {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x16xf16, #blocked>>
 
       tt.return
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -2232,7 +2232,7 @@ struct StoreOpConversion
       }
 
       Value maskVal = threadPred;
-      if (llMask) {
+      if (maskElems.size() > 0) {
         auto mask = maskElems[vecStart];
         maskVal = maybeAnd(rewriter, loc, threadPred, mask);
       }

Original file line number	Diff line number	Diff line change
`@@ -2232,7 +2232,7 @@ struct StoreOpConversion`
`2232`	`2232`	`}`
`2233`	`2233`
`2234`	`2234`	`Value maskVal = threadPred;`
`2235`		`- if (llMask) {`
	`2235`	`+ if (maskElems.size() > 0) {`
`2236`	`2236`	`auto mask = maskElems[vecStart];`
`2237`	`2237`	`maskVal = maybeAnd(rewriter, loc, threadPred, mask);`
`2238`	`2238`	`}`