Add a unit test for lowering the tt.load of a tensor of pointers with structured memory.

chengjunlu · chengjunlu · commit 28e9720f71af · 2025-04-11T10:19:38.000Z
Fix segfault in LoadOpToBlockIOConversion

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/python/test/unit/intel/test_block_load.py b/python/test/unit/intel/test_block_load.py
@@ -1,3 +1,6 @@
+import itertools
+
+import numpy as np
 import pytest
 import torch
 import pathlib
@@ -7,6 +10,27 @@
 
 
 @pytest.mark.parametrize("M, N", [[256, 64], [256, 32], [128, 32], [128, 16], [128, 8], [64, 64], [64, 32], [32, 32]])
+class DpasLayout:
+
+    def __init__(self, repeatCount, systolic_depth, execution_size, ops_per_chan, threads_per_warp, warps_per_cta,
+                 rep_cluster):
+        self.repeatCount = repeatCount
+        self.systolic_depth = systolic_depth
+        self.execution_size = execution_size
+        self.ops_per_chan = ops_per_chan
+        self.threads_per_warp = threads_per_warp
+        self.warps_per_cta = warps_per_cta
+        self.rep_cluster = rep_cluster
+
+    def __str__(self):
+        return f"#triton_intel_gpu.dpas<{{repeatCount={self.repeatCount}, systolicDepth={self.systolic_depth}, executionSize = {self.execution_size}, opsPerChan = {self.ops_per_chan}, threadsPerWarp = {self.threads_per_warp}, warpsPerCTA={self.warps_per_cta}, repCluster={self.rep_cluster}}}>"
+
+
+def warps_per_cta(layout):
+    return layout.warps_per_cta
+
+
+@pytest.mark.parametrize("M, N", [[256, 64], [256, 32], [128, 32], [64, 64], [64, 32], [32, 32]])
 @pytest.mark.parametrize("dtype_str", ["float32", "float16", "int8"])
 @pytest.mark.parametrize("transpose", [True, False])
 @pytest.mark.skipif(not is_xpu(), reason="Block load tests are specific to the XPU backend")
@@ -79,3 +103,107 @@ def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pa
     kernel[(1, 1, 1)](a, x, b, y)
     #import pdb; pdb.set_trace()
     assert torch.equal(a, x) and torch.equal(b.T if transpose else b, y)
+
+
+layouts = [
+    # Layout for Xe2 and Xe2+
+    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=4, threads_per_warp=16,
+               warps_per_cta=[1, 4], rep_cluster=[1, 2]),
+    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=2, threads_per_warp=16,
+               warps_per_cta=[8, 4], rep_cluster=[4, 2]),
+    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=1, threads_per_warp=16,
+               warps_per_cta=[8, 4], rep_cluster=[1, 1]),
+    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=4, threads_per_warp=32,
+               warps_per_cta=[1, 4], rep_cluster=[1, 2]),
+    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=2, threads_per_warp=32,
+               warps_per_cta=[8, 4], rep_cluster=[4, 2]),
+    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=1, threads_per_warp=32,
+               warps_per_cta=[8, 4], rep_cluster=[1, 1]),
+    # Layout for Xe: none listed yet.
+]
+
+
+@pytest.mark.parametrize("M, N", [[M, N] for M, N in itertools.product([32, 64, 128, 256], [32, 64, 128, 256])])
+@pytest.mark.parametrize("dtype_str", ["float32", "float16", "int8"])
+@pytest.mark.parametrize("layout", layouts)
+@pytest.mark.skipif(not is_xpu(), reason="Block load tests are specific to the XPU backend")
+def test_tensor_pointer_block_load(M, N, dtype_str, layout, device, tmp_path: pathlib.Path):
+
+    warps = warps_per_cta(layout)
+    num_warps = int(np.prod(warps))
+    threads_per_warp = layout.threads_per_warp
+    ops_per_chan = layout.ops_per_chan
+    A_width = 1 if ops_per_chan == 1 else ops_per_chan // 2
+    B_width = ops_per_chan
+
+    ty = {"float32": "f32", "float16": "f16", "int8": "i8"}[dtype_str]
+
+    support_block_io = torch.xpu.get_device_capability()['has_subgroup_2d_block_io']
+
+    ir = f"""
+    #mma = {layout}
+    #dot_a = #ttg.dot_op<{{opIdx = 0, parent = #mma, kWidth = {A_width}}}>
+    #dot_b = #ttg.dot_op<{{opIdx = 1, parent = #mma, kWidth = {B_width}}}>
+    module attributes {{triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, {"triton_intel_gpu.support_sg_2d_block," if support_block_io else ""} triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = {num_warps} : i32, ttg.target = "xpu", "ttg.threads-per-warp" = {threads_per_warp} : i32}} {{
+        tt.func public @tensor_pointer_block_load(%arg0: !tt.ptr<{ty}> {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr<{ty}> {{tt.divisibility = 16 : i32}}, %arg6: i32 {{tt.divisibility = 16 : i32}}, %arg2: !tt.ptr<{ty}> {{tt.divisibility = 16: i32}}, %arg3: !tt.ptr<{ty}> {{tt.divisibility = 16: i32}}, %arg7: i32 {{tt.divisibility = 16 : i32}}) attributes {{noinline = false}} {{
+            // A matrix
+            %1 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_a}}>>
+            %2 = tt.expand_dims %1 {{axis = 1 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_a}}>> -> tensor<{M}x1xi32, #dot_a>
+            %3 = tt.splat %arg6 : i32 -> tensor<{M}x1xi32, #dot_a>
+            %4 = arith.muli %2, %3 : tensor<{M}x1xi32, #dot_a>
+            %5 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_a}}>>
+            %6 = tt.expand_dims %5 {{axis = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_a}}>> -> tensor<1x{N}xi32, #dot_a>
+            %7 = tt.broadcast %4 : tensor<{M}x1xi32, #dot_a> -> tensor<{M}x{N}xi32, #dot_a>
+            %8 = tt.broadcast %6 : tensor<1x{N}xi32, #dot_a> -> tensor<{M}x{N}xi32, #dot_a>
+            %9 = arith.addi %7, %8 : tensor<{M}x{N}xi32, #dot_a>
+
+            %10 = tt.splat %arg0 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>
+            %11 = tt.addptr %10, %9 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>, tensor<{M}x{N}xi32, #dot_a>
+            %12 = tt.load %11 {{triton_intel_gpu.block_io = "row_major"}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>
+            %13 = tt.splat %arg1 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>
+            %14 = tt.addptr %13, %9 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>, tensor<{M}x{N}xi32, #dot_a>
+            tt.store %14, %12 {{boundaryCheck = array<i32: 0, 1>}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>
+
+            // B matrix
+            %22 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_b}}>>
+            %44 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_b}}>>
+            %46 = tt.expand_dims %44 {{axis = 1 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_b}}>> -> tensor<{M}x1xi32, #dot_b>
+            %48 = tt.splat %arg7 : i32 -> tensor<{M}x1xi32, #dot_b>
+            %49 = arith.muli %46, %48 : tensor<{M}x1xi32, #dot_b>
+            %50 = tt.expand_dims %22 {{axis = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_b}}>> -> tensor<1x{N}xi32, #dot_b>
+            %51 = tt.broadcast %49 : tensor<{M}x1xi32, #dot_b> -> tensor<{M}x{N}xi32, #dot_b>
+            %52 = tt.broadcast %50 : tensor<1x{N}xi32, #dot_b> -> tensor<{M}x{N}xi32, #dot_b>
+            %53 = arith.addi %51, %52 : tensor<{M}x{N}xi32, #dot_b>
+
+            %54 = tt.splat %arg2 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>
+            %55 = tt.addptr %54, %53 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>, tensor<{M}x{N}xi32, #dot_b>
+            %56 = tt.load %55 {{triton_intel_gpu.block_io = "row_major"}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>
+            %57 = tt.splat %arg3 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>
+            %58 = tt.addptr %57, %53 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>, tensor<{M}x{N}xi32, #dot_b>
+            tt.store %58, %56 {{boundaryCheck = array<i32: 0, 1>}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>
+
+            tt.return
+        }}
+    }}
+    """
+
+    torch_dtype = getattr(torch, dtype_str)
+    if torch_dtype.is_floating_point:
+        a = torch.randn((M, N), dtype=torch_dtype, device=device)
+    else:
+        a = torch.randint(low=-127, high=128, size=(M, N), dtype=torch_dtype, device=device)
+
+    x = torch.empty_like(a)
+    y = torch.empty_like(a)
+
+    temp_file = tmp_path / "test_tensor_pointer_block_load.ttgir"
+    temp_file.write_text(ir)
+    kernel = triton.compile(str(temp_file))
+
+    if support_block_io:
+        # assert '2d block io' in kernel.asm['llir']
+        pass
+
+    kernel[(1, 1, 1)](a, x, a.stride(0), a, y, a.stride(0))
+
+    assert torch.equal(a, x) and torch.equal(a, y)
diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp
@@ -110,6 +110,39 @@ loadCacheControlToCacheControls(Builder &builder,
   return builder.getAttr<TritonGEN::DecorationCacheControlAttr>(decorations);
 }
 
+[[maybe_unused]] static bool
+isOCLBuiltinAvailable(TritonGEN::Matrix2DBlockLoadOp op) {
+  VectorType resTy = op.getRes().getType();
+  unsigned resElemTySize = resTy.getElementType().getIntOrFloatBitWidth();
+  bool needsResElemSizeEqualTo32 =
+      op.getElemSizeInBits() == 32 || op.getVnniTransform();
+  assert((!needsResElemSizeEqualTo32 || resElemTySize == 32) &&
+         "Expecting 32-bit element type");
+  if (!needsResElemSizeEqualTo32 && resElemTySize != 16)
+    return false;
+
+  if (op.getVnniTransform())
+    return true;
+
+  if (op.getTranspose() && op.getTileHeight() != 16)
+    return false;
+
+  uint32_t tileWidth = op.getTileWidth();
+  uint32_t tileHeight = op.getTileHeight();
+  switch (op.getElemSizeInBits()) {
+  case 8:
+    return (tileWidth == 32);
+  case 16:
+    return (tileWidth == 16) && (tileHeight != 32);
+  case 32:
+    return (tileWidth == 8 || tileWidth == 16) && (tileHeight != 32);
+  default:
+    llvm_unreachable("unexpected element size");
+  }
+
+  return false;
+}
+
 [[maybe_unused]] static Value
 createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op,
                         ConversionPatternRewriter &rewriter) {
@@ -119,12 +152,20 @@ createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op,
   auto b = TritonLLVMOpBuilder(loc, rewriter);
 
   Value ptr = op.getPtr();
-  Value baseWidth = op.getBaseWidth();
   Value baseHeight = op.getBaseHeight();
   Value basePitch = op.getBasePitch();
-  Value x = op.getX();
   Value y = op.getY();
 
+  // Compensate for a base address that is not 64-byte aligned.
+  Value offset =
+      b.trunc(i32_ty, b.and_(b.ptrtoint(i64_ty, ptr), b.i64_val(0x3f)));
+  // In number of bytes.
+  Value baseWidth = b.add(op.getBaseWidth(), offset);
+  // In number of scalar elements.
+  Value offsetX =
+      b.add(op.getX(),
+            b.lshr(offset, b.i32_val(std::log2(op.getElemSizeInBits() / 8))));
+
   std::string funcName =
       "llvm.genx.GenISA.LSC2DBlockRead." + getGenISATypeMangling(resType);
   IntegerType int1Ty = rewriter.getIntegerType(1);
@@ -139,7 +180,7 @@ createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op,
                              baseWidth.getType(),
                              baseHeight.getType(),
                              basePitch.getType(),
-                             x.getType(),
+                             offsetX.getType(),
                              y.getType(),
                              int32Ty,
                              int32Ty,
@@ -153,7 +194,7 @@ createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op,
                           b.sub(baseWidth, one),
                           b.sub(baseHeight, one),
                           b.sub(basePitch, one),
-                          x,
+                          offsetX,
                           y,
                           b.i32_val(op.getElemSizeInBits()),
                           b.i32_val(op.getTileWidth()),
@@ -421,8 +462,9 @@ struct TritonMatrix2DBlockLoadLowering
   LogicalResult
   matchAndRewrite(TritonGEN::Matrix2DBlockLoadOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    if (op.getElemSizeInBits() == 8 && op.getTileWidth() == 16 &&
-        op.getVBlocks() != 4 && !op.getVnniTransform()) {
+    if (!isOCLBuiltinAvailable(op) ||
+        op.getElemSizeInBits() == 8 && op.getTileWidth() == 16 &&
+            op.getVBlocks() != 4 && !op.getVnniTransform()) {
       // TODO: add ocl builtin/spirv intrinsics for 8b 16 column 1 vBlock & 2
       // vBlock reads
       rewriter.replaceOp(op, createGenISA2DBlockRead(op, rewriter));
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -494,6 +494,11 @@ struct LoadOpToBlockIOConversion
   LogicalResult
   matchAndRewrite(triton::LoadOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const final {
+    ModuleOp mod = op->getParentOfType<ModuleOp>();
+    if (!mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect::
+                          getSupportSG2DBlockAttrName()))
+      return failure();
+
     Attribute blockIOAttr =
         op->getAttr(TritonIntelGPUDialect::getBlockIOAttrName());
     if (!blockIOAttr)
@@ -727,6 +732,10 @@ struct LoadOpToBlockIOConversion
         if (otherElems.size())
           others[offset] = otherElems[i];
       }
+      // ptrs[{0, 0}] and ptrs[{1, 0}] are currently used to calculate the
+      // pitch.
+      if (ptrs.count({0, 0}) < 1 || ptrs.count({1, 0}) < 1)
+        return failure();
     }
 
     unsigned numOperandsPer2DLoadM, numOperandsPer2DloadN;
@@ -769,6 +778,8 @@ struct LoadOpToBlockIOConversion
     // PVC 2D load supports 64 bytes per row at most. Load multiple dot operands
     // by enlarging the vBlocks.
     unsigned totalBytesPerRowPerDPASOp = tileWidth * elemSizeInBits / 8;
+    if (totalBytesPerRowPerDPASOp > 64)
+      return failure();
     numOperandsPer2DloadN =
         std::min(numOperandsPer2DloadN, 64 / totalBytesPerRowPerDPASOp);
 
@@ -815,6 +826,8 @@ struct LoadOpToBlockIOConversion
     StringAttr kWarp = str_attr("warp");
     StringAttr kBlock = str_attr("block");
 
+    const unsigned originalElemBits = elemSizeInBits;
+
     ValueTable loadVals;
     for (int inner = 0; inner < numRepInner;
          inner += numOperandsInnerDimPerLoad) {
@@ -884,9 +897,9 @@ struct LoadOpToBlockIOConversion
                     /*tile_height*/ tileHeight,
                     /*v_blocks*/ vBlocks,
                     /*transpose*/ false,
-                    /*vnni_transform*/ opIdx ==
-                            DpasEncodingAttr::OpIdx::OperandB &&
-                        usePackedType);
+                    /*vnni_transform*/
+                    (usePackedType && !isOperandA && !isTransposeRequired &&
+                     originalElemBits != 32));
                 return SmallVector<Value, 1>{load2dOp};
               });
           Value ret = *endBlock.args_begin();