[TritonGEN] Use the module's sub-group size instead of the hard-coded value of 16 in block load verification.

chengjunlu · chengjunlu · commit d865d1582827 · 2025-07-22T09:49:26.000Z
Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com>
diff --git a/third_party/intel/lib/Dialect/TritonGEN/IR/TritonGENOps.cpp b/third_party/intel/lib/Dialect/TritonGEN/IR/TritonGENOps.cpp
@@ -13,6 +13,8 @@
 #include "llvm/ADT/STLExtras.h"
 #include <cstdint>
 
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+
 using namespace mlir;
 using namespace mlir::triton;
 
@@ -238,7 +240,9 @@ verify2DBlockLoadHWRestriction(TritonGEN::Matrix2DBlockLoadOp op) {
   VectorType resTy = op.getRes().getType();
   unsigned resElemTySize = resTy.getElementType().getIntOrFloatBitWidth();
   unsigned resSize = resTy.getNumElements() * resElemTySize;
-  constexpr unsigned subgroupSize = 16;
+  unsigned subgroupSize = triton::gpu::TritonGPUDialect::getThreadsPerWarp(
+      op->getParentOfType<mlir::ModuleOp>());
+  ;
   unsigned expectedSize = op.getElemSizeInBits() * op.getTileHeight() *
                           op.getTileWidth() * op.getVBlocks() / subgroupSize;
   if (resSize != expectedSize)