Skip to content

[tensor-descriptor]: Extend support when tensor descriptor created in control flow #4152

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
May 28, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
4384ad1
Ensure block ptr is created with the same layout as the descriptor_lo…
etiotto May 9, 2025
f0ce91c
Remove naked print and unnecessary headers
etiotto May 9, 2025
04f1c1d
Merge remote-tracking branch 'origin/main' into etiotto.tensor_desc_t…
etiotto May 14, 2025
3543e6e
WIP: TensorDescToBlockPtr updates
etiotto May 14, 2025
38ef6c3
WIP: RemoveLayoutConversion improvement for tt.advance operation
etiotto May 15, 2025
e4d5d7d
Merge remote-tracking branch 'origin/main' into etiotto.tensor_desc_t…
etiotto May 15, 2025
9eb16ec
WIP: TensorDescToBlockPtr updates
etiotto May 15, 2025
7f9bbc9
WIP: TensorDescToBlockPtr updates
etiotto May 15, 2025
f6ed66a
WIP: TensorDescToBlockPtr updates
etiotto May 16, 2025
cb4bb2e
WIP: TensorDescToBlockPtr updates
etiotto May 20, 2025
f6ce50a
Merge branch 'main' into etiotto.tensor_desc_to_block_ptr.1
etiotto May 20, 2025
b439d24
Merge remote-tracking branch 'origin/main' into etiotto.tensor_desc_t…
etiotto May 22, 2025
e5f74b8
Address code review comments
etiotto May 22, 2025
ff321f5
Merge remote-tracking branch 'origin/main' into etiotto.tensor_desc_t…
etiotto May 22, 2025
7fcbe40
Add unit test
etiotto May 22, 2025
567d82f
Split unrelated changes in a separate PR
etiotto May 22, 2025
75e5ef4
Merge branch 'main' into etiotto.tensor_desc_to_block_ptr.1
whitneywhtsang May 23, 2025
7be4b3f
Merge branch 'main' into etiotto.tensor_desc_to_block_ptr.1
etiotto May 26, 2025
463adf8
Merge remote-tracking branch 'origin/main' into etiotto.tensor_desc_t…
etiotto May 26, 2025
93bec90
Add test 'load_in_while_loop' and fix it
etiotto May 26, 2025
f61f753
Fix precommit
etiotto May 26, 2025
27a78c1
Add test 'while_uses_tdesc_yielded_by_for_loop' and fix it
etiotto May 27, 2025
c55925f
Add test 'while_loop_with_if_stmt' and fix it
etiotto May 27, 2025
c39325d
Merge remote-tracking branch 'origin/main' into etiotto.tensor_desc_t…
etiotto May 27, 2025
9f41668
Fix precommit
etiotto May 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 20 additions & 9 deletions test/Triton/Intel/TensorDescToBlockPointer/basic.mlir
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
// RUN: triton-opt %s -triton-intel-tdesc-to-block-pointer | FileCheck %s

module {
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>

module attributes {"ttg.num-warps" = 4 : i32} {
tt.func public @test_load(%arg0: !tt.ptr<f32>, %arg1: i32, %arg2: i32) {
%c1_i64 = arith.constant 1 : i64
%c64_i32 = arith.constant 64 : i32
%c8_i32 = arith.constant 8 : i32
%0 = arith.extsi %arg2 : i32 to i64
%desc = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64] : <f32>, <tensor<16x128xf32>>
%load = tt.descriptor_load %desc[%c8_i32, %c64_i32] : !tt.tensordesc<tensor<16x128xf32>> -> tensor<16x128xf32>
%desc1 = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64] : <f32>, <tensor<16x128xf32>>
%load1 = tt.descriptor_load %desc1[%c8_i32, %c64_i32] : !tt.tensordesc<tensor<16x128xf32>> -> tensor<16x128xf32>
%load2 = tt.descriptor_load %desc1[%c8_i32, %c64_i32] : !tt.tensordesc<tensor<16x128xf32>> -> tensor<16x128xf32, #blocked>
tt.return
}
// CHECK: #[[$BLOCKED:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
// CHECK: tt.func public @test_load([[PARAM_0:%.+]]: !tt.ptr<f32>, [[PARAM_1:%.+]]: i32, [[PARAM_2:%.+]]: i32) {
// CHECK-NOT: tt.make_tensor_descriptor
// CHECK-NOT: tt.descriptor_load
Expand All @@ -18,8 +22,10 @@ module {
// CHECK-DAG: [[CST_8_i32:%.+]] = arith.constant 8 : i32
// CHECK-DAG: [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
// CHECK-DAG: [[EXTSI_PARAM_2:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
// CHECK: [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] {{.*}} : <tensor<16x128xf32>>
// CHECK: [[LOAD:%.+]] = tt.load [[TENSOR_PTR]] : !tt.ptr<tensor<16x128xf32>>
// CHECK: [[TENSOR_PTR1:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] {{.*}} : <tensor<16x128xf32>>
// CHECK: [[LOAD1:%.+]] = tt.load [[TENSOR_PTR1]] : !tt.ptr<tensor<16x128xf32>>
// CHECK: [[TENSOR_PTR2:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] {{.*}} : <tensor<16x128xf32, #[[$BLOCKED]]>>
// CHECK: [[LOAD2:%.+]] = tt.load [[TENSOR_PTR2]] : !tt.ptr<tensor<16x128xf32, #[[$BLOCKED]]>>
// CHECK: tt.return
// CHECK: }

Expand All @@ -28,9 +34,11 @@ module {
%c64_i32 = arith.constant 64 : i32
%c8_i32 = arith.constant 8 : i32
%cst = arith.constant dense<1.000000e+00> : tensor<16x128xf32>
%cst1 = arith.constant dense<1.000000e+00> : tensor<16x128xf32, #blocked>
%0 = arith.extsi %arg2 : i32 to i64
%desc = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64] : <f32>, <tensor<16x128xf32>>
tt.descriptor_store %desc[%c8_i32, %c64_i32], %cst : !tt.tensordesc<tensor<16x128xf32>>, tensor<16x128xf32>
%desc1 = tt.make_tensor_descriptor %arg0, [%arg1, %arg2], [%0, %c1_i64] : <f32>, <tensor<16x128xf32>>
tt.descriptor_store %desc1[%c8_i32, %c64_i32], %cst : !tt.tensordesc<tensor<16x128xf32>>, tensor<16x128xf32>
tt.descriptor_store %desc1[%c8_i32, %c64_i32], %cst1 : !tt.tensordesc<tensor<16x128xf32>>, tensor<16x128xf32, #blocked>
tt.return
}
// CHECK: tt.func public @test_store([[PARAM_0:%.+]]: !tt.ptr<f32>, [[PARAM_1:%.+]]: i32, [[PARAM_2:%.+]]: i32) {
Expand All @@ -40,10 +48,13 @@ module {
// CHECK-DAG: [[CST_64_i32:%.+]] = arith.constant 64 : i32
// CHECK-DAG: [[CST_8_i32:%.+]] = arith.constant 8 : i32
// CHECK-DAG: [[CST:%.+]] = arith.constant dense<1.000000e+00> : tensor<16x128xf32>
// CHECK-DAG: [[CST1:%.+]] = arith.constant dense<1.000000e+00> : tensor<16x128xf32, #[[$BLOCKED]]>
// CHECK-DAG: [[EXTSI_PARAM_1:%.+]] = arith.extsi [[PARAM_1]] : i32 to i64
// CHECK-DAG: [[EXTSI_PARAM_2:%.+]] = arith.extsi [[PARAM_2]] : i32 to i64
// CHECK: [[TENSOR_PTR:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] {{.*}} : <tensor<16x128xf32>>
// CHECK: tt.store [[TENSOR_PTR]], [[CST]] : !tt.ptr<tensor<16x128xf32>>
// CHECK: [[TENSOR_PTR1:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] {{.*}} : <tensor<16x128xf32>>
// CHECK: tt.store [[TENSOR_PTR1]], [[CST]] : !tt.ptr<tensor<16x128xf32>>
// CHECK: [[TENSOR_PTR2:%.+]] = tt.make_tensor_ptr [[PARAM_0]], {{\[}}[[EXTSI_PARAM_1]], [[EXTSI_PARAM_2]]], {{\[}}[[EXTSI_PARAM_2]], [[CST_1_i64]]], {{\[}}[[CST_8_i32]], [[CST_64_i32]]] {{.*}} : <tensor<16x128xf32, #[[$BLOCKED]]>>
// CHECK: tt.store [[TENSOR_PTR2]], [[CST1]] : !tt.ptr<tensor<16x128xf32, #[[$BLOCKED]]>>
// CHECK: tt.return
// CHECK: }
}
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
#include "intel/include/Dialect/Triton/Transforms/Passes.h"
#include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Verifier.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "triton/Dialect/Triton/IR/Types.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "triton-intel-tdesc-to-block-pointer"
Expand Down Expand Up @@ -121,11 +120,19 @@ struct TritonIntelTensorDescToBlockPointer
return (yieldedVal != blockArg);
}

// Create a new block pointer if a suitable one doesn't already exist.
// Otherwise, return the existing one. The function takes the base, shape,
// strides, offsets, sizes of the block pointer to create/lookup and its
// tensor element type (to ensure the block pointer has the tensor layout).
Value findOrCreateMakeTensorPtr(Location loc, Value base, ValueRange shape,
ValueRange strides, ValueRange offsets,
ArrayRef<int32_t> sizes, OpBuilder &builder) {
ArrayRef<int32_t> sizes,
RankedTensorType tensorType,
OpBuilder &builder) {
Block *block = builder.getInsertionBlock();
const Block::iterator insertPoint = builder.getInsertionPoint();
auto ptrType = tt::PointerType::get(
tensorType, tt::TritonGEN::TritonGENMemorySpace::kCrossWorkgroup);

auto it = std::find_if(block->begin(), insertPoint, [&](Operation &op) {
if (auto makeTensorPtrOp = dyn_cast<tt::MakeTensorPtrOp>(op)) {
Expand All @@ -138,7 +145,9 @@ struct TritonIntelTensorDescToBlockPointer
}
return true;
};
return makeTensorPtrOp.getBase() == base &&

return makeTensorPtrOp.getType() == ptrType &&
makeTensorPtrOp.getBase() == base &&
makeTensorPtrOp.getShape() == shape &&
makeTensorPtrOp.getStrides() == strides &&
makeTensorPtrOp.getOffsets() == offsets &&
Expand All @@ -147,10 +156,16 @@ struct TritonIntelTensorDescToBlockPointer
return false;
});

auto makeTensorPtrOp = [&]() {
Value makeTensorPtr = builder.create<tt::MakeTensorPtrOp>(
loc, base, shape, strides, offsets, sizes,
builder.getDenseI32ArrayAttr({1, 0}));
makeTensorPtr.setType(ptrType);
return makeTensorPtr;
};

return (it != insertPoint) ? cast<tt::MakeTensorPtrOp>(*it)
: builder.createOrFold<tt::MakeTensorPtrOp>(
loc, base, shape, strides, offsets, sizes,
builder.getDenseI32ArrayAttr({1, 0}));
: makeTensorPtrOp();
}

template <typename OpTy,
Expand All @@ -176,6 +191,11 @@ struct TritonIntelTensorDescToBlockPointer

LLVM_DEBUG(llvm::dbgs() << "which has tdesc: " << makeTensorDescOp << "\n");

auto createPointerType = [](RankedTensorType tensorType) {
return tt::PointerType::get(
tensorType, tt::TritonGEN::TritonGENMemorySpace::kCrossWorkgroup);
};

// Create a new block pointer if a suitable one doesn't already exist.
SmallVector<Value> shapes, strides, offsets;
SmallVector<int32_t> sizes;
Expand All @@ -193,16 +213,22 @@ struct TritonIntelTensorDescToBlockPointer
sizes.push_back(static_cast<int32_t>(size));
}

constexpr bool isLoad = std::is_same_v<OpTy, tt::DescriptorLoadOp>;
RankedTensorType tensorType;
if constexpr (isLoad)
tensorType = op.getResult().getType();
else
tensorType = op.getSrc().getType();

Value makeTensorPtrOp =
findOrCreateMakeTensorPtr(loc, makeTensorDescOp.getBase(), shapes,
strides, offsets, sizes, builder);
strides, offsets, sizes, tensorType, builder);

LLVM_DEBUG({
llvm::dbgs() << "With:\n";
llvm::dbgs().indent(2) << makeTensorPtrOp << "\n";
});

constexpr bool isLoad = std::is_same_v<OpTy, tt::DescriptorLoadOp>;
if constexpr (isLoad) {
auto loadOp = builder.createOrFold<tt::LoadOp>(
loc, makeTensorPtrOp, op.getCache(), op.getEvict(),
Expand Down