intel · whitneywhtsang · May 2, 2025 · Apr 30, 2025 · May 1, 2025 · May 1, 2025
diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 support-regular-ptr=true" | FileCheck %s
+// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3" | FileCheck %s
 
 // CHECK: #[[$BLOCK_0:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
 // CHECK: #[[$BLOCK_1:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>

diff --git a/test/TritonIntelGPU/split-barrier.mlir b/test/TritonIntelGPU/split-barrier.mlir
@@ -1,5 +1,5 @@
-// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 support-regular-ptr=true split-barriers-scope=workgroup" | FileCheck %s --check-prefixes=CHECK,WORKGROUP_SCOPE
-// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 support-regular-ptr=true split-barriers-scope=subgroup" | FileCheck %s --check-prefixes=CHECK,SUBGROUP_SCOPE
+// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 split-barriers-scope=workgroup" | FileCheck %s --check-prefixes=CHECK,WORKGROUP_SCOPE
+// RUN: triton-opt %s -split-input-file -tritonintelgpu-pipeline="num-stages=3 split-barriers-scope=subgroup" | FileCheck %s --check-prefixes=CHECK,SUBGROUP_SCOPE
 
 // CHECK: #[[$BLOCK:.+]] = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 16], warpsPerCTA = [8, 4], order = [1, 0]}>
 // CHECK: #[[$DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 8], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>

diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
@@ -310,7 +310,7 @@ def make_ttgir(mod, metadata, opt, properties):
         intel.passes.ttgpuir.add_accelerate_matmul(pm)
         intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         intel.passes.ttgpuir.add_materialize_block_pointer(pm)
-        intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False, XPUBackend.get_split_barrier_scope(opt))
+        intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, XPUBackend.get_split_barrier_scope(opt))
 
         passes.ttgpuir.add_fuse_nested_loops(pm)
         passes.ttgpuir.add_optimize_thread_locality(pm)

diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td
@@ -122,9 +122,6 @@ def TritonIntelGPUPipeline : Pass<"tritonintelgpu-pipeline", "mlir::ModuleOp"> {
     Option<"numStages", "num-stages",
            "int32_t", /*default*/"3",
            "number of pipeline stages">,
-    Option<"supportRegularPtr", "support-regular-ptr",
-           "bool", /*default*/"false",
-           "Enable support for prefetching non-block pointers">,
     Option<"splitBarrierScope", "split-barriers-scope",
            "enum SplitBarrierScope", "SplitBarrierScope::None",
            "insert split barriers in a loop",

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
@@ -117,8 +117,7 @@ static std::optional<LoadDotOperand> loadDotOperand(tt::LoadOp loadOp) {
 
-/// Collect loads to pipeline. Return success if we can pipeline this loop.
+/// Collect loads to pipeline into `loadOps`, which must be empty on entry.
 static void collectOpsToPipeline(scf::ForOp forOp,
-                                 SmallVectorImpl<LoadDotOperand> &loadOps,
-                                 bool supportRegularPtr) {
+                                 SmallVectorImpl<LoadDotOperand> &loadOps) {
   assert(loadOps.empty() && "Expecting an empty list of load operations");
 
   ModuleOp moduleOp = forOp->getParentOfType<ModuleOp>();
@@ -128,11 +127,6 @@ static void collectOpsToPipeline(scf::ForOp forOp,
   // operations in the loop body block.
   for (Operation &op : forOp) {
     if (auto loadOp = dyn_cast<tt::LoadOp>(&op)) {
-      Value ptr = loadOp.getPtr();
-      bool isBlockPtr = mlir::triton::isTensorPointerType(ptr.getType());
-      if (!isBlockPtr && !supportRegularPtr)
-        continue;
-
       // In order to avoid polluting the cache, do not prefetch loads unless the
       // memory they reference is densely structured.
       Attribute blockIOAttr =
@@ -309,12 +303,11 @@ createSchedule(scf::ForOp forOp, int numStages) {
 }
 
 bool ttgi::preProcessLoopAndGetSchedule(scf::ForOp &forOp, int numStages,
-                                        bool supportRegularPtr,
                                         mlir::scf::PipeliningOption &options) {
   // 1. First collect "interesting" operations with a stage where to schedule
   // them. This gives a coarse scheduling for the loop.
   SmallVector<LoadDotOperand> loads;
-  collectOpsToPipeline(forOp, loads, supportRegularPtr);
+  collectOpsToPipeline(forOp, loads);
   if (loads.empty()) {
     LDBG("No loads to pipeline");
     return false;

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/Schedule.h
@@ -6,7 +6,6 @@
 namespace mlir::triton::gpu::intel {
 
 bool preProcessLoopAndGetSchedule(scf::ForOp &forOp, int numStages,
-                                  bool supportRegularPtr,
                                   mlir::scf::PipeliningOption &options);
 
 } // namespace mlir::triton::gpu::intel

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp
@@ -39,14 +39,14 @@ static bool preCondition(scf::ForOp forOp) {
 }
 
 static void
-pipelineLoop(scf::ForOp forOp, int numStages, bool supportRegularPtr,
+pipelineLoop(scf::ForOp forOp, int numStages,
              std::optional<spirv::Scope> barrierScope = std::nullopt) {
   mlir::scf::PipeliningOption options;
   if (!preCondition(forOp))
     return;
 
-  bool foundSchedule = ttgi::preProcessLoopAndGetSchedule(
-      forOp, numStages, supportRegularPtr, options);
+  bool foundSchedule =
+      ttgi::preProcessLoopAndGetSchedule(forOp, numStages, options);
   if (!foundSchedule)
     return;
 
@@ -107,9 +107,8 @@ struct IntelGPUPipelinePass
     SmallVector<scf::ForOp> loops;
     getOperation()->walk([&](scf::ForOp forOp) { loops.push_back(forOp); });
 
-    for (scf::ForOp forOp : loops) {
-      pipelineLoop(forOp, numStages, supportRegularPtr, barrierScope);
-    }
+    for (scf::ForOp forOp : loops)
+      pipelineLoop(forOp, numStages, barrierScope);
   }
 };
 } // anonymous namespace
diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
@@ -88,8 +88,8 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) {
                      gpu::intel::createTritonIntelGPUAccelerateMatmul);
   ADD_PASS_WRAPPER_0("add_rewrite_stack_ptr",
                      gpu::intel::createTritonIntelGPURewriteStackPtr);
-  ADD_PASS_WRAPPER_OPT_3("add_pipeline",
-                         gpu::intel::createTritonIntelGPUPipeline, int, bool,
+  ADD_PASS_WRAPPER_OPT_2("add_pipeline",
+                         gpu::intel::createTritonIntelGPUPipeline, int,
                          enum gpu::intel::SplitBarrierScope);
   ADD_PASS_WRAPPER_0("add_remove_layout_conversions",
                      gpu::intel::createTritonIntelGPURemoveLayoutConversions);