Skip to content

Commit e3d441a

Browse files
etiottowhitneywhtsang
authored and committed
Fix failing CI test
Signed-off-by: Tiotto, Ettore <[email protected]>
1 parent 78b26b8 commit e3d441a

File tree

2 files changed

+30
-4
lines changed

2 files changed

+30
-4
lines changed

test/TritonIntelGPU/prefetch-to-llvm.mlir

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,3 +156,16 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32}
156156
tt.return
157157
}
158158
}
159+
160+
// -----
161+
162+
// COM: Currently the prefetch operation in this test cannot be lowered correctly, so we check that the test compiles cleanly and that no 2D block prefetch operation gets generated.
163+
#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [4, 1], repCluster = [4, 1], A = [32, 8], B = [8, 16], C = [32, 16]}>
164+
module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} {
165+
// CHECK-LABEL: llvm.func spir_kernelcc @kernel
166+
tt.func public @kernel(%arg0 : tensor<128x32x!tt.ptr<f32>, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>) {
167+
// CHECK-NOT: intel_sub_group_2d_block_prefetch
168+
triton_intel_gpu.prefetch %arg0 {boundaryCheck = array<i32>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, operandSegmentSizes = array<i32: 1, 0, 0>, triton_intel_gpu.block_io = "row_major"} : tensor<128x32x!tt.ptr<f32>, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
169+
tt.return
170+
}
171+
}

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -325,10 +325,18 @@ struct PrefetchOpConversion
325325
LogicalResult
326326
matchAndRewrite(triton::gpu::intel::PrefetchOp op, OpAdaptor adaptor,
327327
ConversionPatternRewriter &rewriter) const final {
328-
Value ptr = op.getPtr();
329-
if (isTensorPointerType(ptr.getType()))
330-
return rewriteTensorPointerPrefetch(op, adaptor, rewriter);
331-
return rewriteRegularPointerPrefetch(op, adaptor, rewriter);
328+
LogicalResult res =
329+
isTensorPointerType(op.getPtr().getType())
330+
? rewriteTensorPointerPrefetch(op, adaptor, rewriter)
331+
: rewriteRegularPointerPrefetch(op, adaptor, rewriter);
332+
333+
// FIXME: the prefetch lowering code should never fail. Currently it does in
334+
// some cases. We should address those cases instead of removing the
335+
// prefetch operation.
336+
if (failed(res))
337+
rewriter.eraseOp(op);
338+
339+
return success();
332340
}
333341

334342
LogicalResult
@@ -641,6 +649,11 @@ struct PrefetchOpConversion
641649
masks[offset] = maskElems[i];
642650
}
643651

652+
// baseAddrs[{0, 0}] and baseAddrs[{1, 0}] are currently used to calculate
653+
// the pitch.
654+
if (baseAddrs.count({0, 0}) == 0 || baseAddrs.count({1, 0}) == 0)
655+
return failure();
656+
644657
Value base, baseWidth, baseHeight, rowStrideInBytes, colStride, offsetBaseX,
645658
offsetBaseY;
646659

0 commit comments

Comments
 (0)