From 9e44f5521ba3a4f0637de732143b0f172b84eb5b Mon Sep 17 00:00:00 2001
From: Princeton Ferro
Date: Wed, 11 Jun 2025 01:31:12 -0400
Subject: [PATCH 1/5] [NVPTX] add combiner rule for final packed op in reduction

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp   | 114 +++++-
 .../CodeGen/NVPTX/reduction-intrinsics.ll     | 340 ++++++------------
 2 files changed, 210 insertions(+), 244 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 492f4ab76fdbb..e2aa907ed8eb8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -852,6 +852,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   if (STI.allowFP16Math() || STI.hasBF16Math())
     setTargetDAGCombine(ISD::SETCC);
 
+  // Combine reduction operations on packed types (e.g. fadd.f16x2) with vector
+  // shuffles when one of their lanes is a no-op.
+  if (STI.allowFP16Math() || STI.hasBF16Math())
+    // already added above: FADD, ADD, AND
+    setTargetDAGCombine({ISD::FMUL, ISD::FMINIMUM, ISD::FMAXIMUM, ISD::UMIN,
+                         ISD::UMAX, ISD::SMIN, ISD::SMAX, ISD::OR, ISD::XOR});
+
   // Promote fp16 arithmetic if fp16 hardware isn't available or the
   // user passed --nvptx-no-fp16-math. The flag is useful because,
   // although sm_53+ GPUs have some sort of FP16 support in
@@ -5069,20 +5076,102 @@ static SDValue PerformStoreRetvalCombine(SDNode *N) {
   return PerformStoreCombineHelper(N, 2, 0);
 }
 
+/// For vector reductions, the final result needs to be a scalar. The default
+/// expansion will use packed ops (ex. fadd.f16x2) even for the final operation.
+/// This requires a packed operation where one of the lanes is undef.
+///
+/// ex: lowering of vecreduce_fadd(V) where V = v4f16<a, b, c, d>
+///
+/// v1: v2f16 = fadd reassoc v2f16<a, b>, v2f16<c, d>  (== <a+c, b+d>)
+/// v2: v2f16 = vector_shuffle<1,u> v1, undef:v2f16    (== <b+d, undef>)
+/// v3: v2f16 = fadd reassoc v2, v1                    (== <a+c+b+d, undef>)
+/// vR: f16 = extractelt v3, 0
+///
+/// We wish to replace vR, v3, and v2 with:
+/// vR: f16 = fadd reassoc (extractelt v1, 1) (extractelt v1, 0)
+///
+/// ...so that we get:
+/// v1: v2f16 = fadd reassoc v2f16<a, b>, v2f16<c, d>  (== <a+c, b+d>)
+/// s1: f16 = extractelt v1, 1
+/// s2: f16 = extractelt v1, 0
+/// vR: f16 = fadd reassoc s1, s2  (== a+c+b+d)
+///
+/// So for this example, this rule will replace v3 and v2, returning a vector
+/// with the result in lane 0 and an undef in lane 1, which we expect will be
+/// folded into the extractelt in vR.
+static SDValue PerformPackedOpCombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+  // Convert:
+  // (fop.x2 (vector_shuffle<i, u> A), B) -> ((fop A:i, B:0), undef)
+  // ...or...
+  // (fop.x2 (vector_shuffle<u, i> A), B) -> (undef, (fop A:i, B:1))
+  // ...where i is a valid index and u is poison.
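  // [Editorial illustration, not part of the original patch; value names are
  //  hypothetical. A source-level reduction that produces the DAG sketched in
  //  the comment above is, for example:
  //
  //    %r = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> %v)
  //
  //  The generic expansion splits %v into two v2f16 halves and adds them (v1);
  //  combining the two lanes of v1 into a scalar is the step that creates the
  //  shuffle with a poison lane handled below.]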
+  const EVT VectorVT = N->getValueType(0);
+  if (!Isv2x16VT(VectorVT))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  SDValue ShufOp = N->getOperand(0);
+  SDValue VectOp = N->getOperand(1);
+  bool Swapped = false;
+
+  // canonicalize shuffle to op0
+  if (VectOp.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    std::swap(ShufOp, VectOp);
+    Swapped = true;
+  }
+
+  if (ShufOp.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+
+  auto *ShuffleOp = cast<ShuffleVectorSDNode>(ShufOp);
+  int LiveLane; // exclusively live lane
+  for (LiveLane = 0; LiveLane < 2; ++LiveLane) {
+    // check if the current lane is live and the other lane is dead
+    if (ShuffleOp->getMaskElt(LiveLane) != PoisonMaskElem &&
+        ShuffleOp->getMaskElt(!LiveLane) == PoisonMaskElem)
+      break;
+  }
+  if (LiveLane == 2)
+    return SDValue();
+
+  int ElementIdx = ShuffleOp->getMaskElt(LiveLane);
+  const EVT ScalarVT = VectorVT.getScalarType();
+  SDValue Lanes[2] = {};
+  for (auto [LaneID, LaneVal] : enumerate(Lanes)) {
+    if (LaneID == (unsigned)LiveLane) {
+      SDValue Operands[2] = {
+          DCI.DAG.getExtractVectorElt(DL, ScalarVT, ShufOp.getOperand(0),
+                                      ElementIdx),
+          DCI.DAG.getExtractVectorElt(DL, ScalarVT, VectOp, LiveLane)};
+      // preserve the order of operands
+      if (Swapped)
+        std::swap(Operands[0], Operands[1]);
+      LaneVal = DCI.DAG.getNode(N->getOpcode(), DL, ScalarVT, Operands);
+    } else {
+      LaneVal = DCI.DAG.getUNDEF(ScalarVT);
+    }
+  }
+  return DCI.DAG.getBuildVector(VectorVT, DL, Lanes);
+}
+
 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
 ///
 static SDValue PerformADDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  CodeGenOptLevel OptLevel) {
-  if (OptLevel == CodeGenOptLevel::None)
-    return SDValue();
-
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
   // Skip non-integer, non-scalar case
   EVT VT = N0.getValueType();
-  if (VT.isVector() || VT != MVT::i32)
+  if (VT.isVector())
+    return PerformPackedOpCombine(N, DCI);
+  if (VT != MVT::i32)
+    return SDValue();
+
+  if (OptLevel == CodeGenOptLevel::None)
     return SDValue();
 
   // First try with the default operand order.
@@ -5102,7 +5191,10 @@ static SDValue PerformFADDCombine(SDNode *N,
   SDValue N1 = N->getOperand(1);
 
   EVT VT = N0.getValueType();
-  if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
+  if (VT.isVector())
+    return PerformPackedOpCombine(N, DCI);
+
+  if (!(VT == MVT::f32 || VT == MVT::f64))
     return SDValue();
 
   // First try with the default operand order.
@@ -5205,7 +5297,7 @@ static SDValue PerformANDCombine(SDNode *N, DCI.CombineTo(N, Val, AddTo); } - return SDValue(); + return PerformPackedOpCombine(N, DCI); } static SDValue PerformREMCombine(SDNode *N, @@ -5686,6 +5778,16 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformADDCombine(N, DCI, OptLevel); case ISD::FADD: return PerformFADDCombine(N, DCI, OptLevel); + case ISD::FMUL: + case ISD::FMINNUM: + case ISD::FMAXIMUM: + case ISD::UMIN: + case ISD::UMAX: + case ISD::SMIN: + case ISD::SMAX: + case ISD::OR: + case ISD::XOR: + return PerformPackedOpCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, OptLevel); case ISD::SHL: diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index d5b451dad7bc3..ca03550bdefcd 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -5,10 +5,10 @@ ; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_80 %} -; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \ +; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx86 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM100 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \ +; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx86 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_100 %} target triple = "nvptx64-nvidia-cuda" @@ -43,45 +43,22 @@ define half @reduce_fadd_half(<8 x half> %in) { } define half @reduce_fadd_half_reassoc(<8 x half> %in) { -; CHECK-SM80-LABEL: reduce_fadd_half_reassoc( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<6>; -; CHECK-SM80-NEXT: .reg .b32 %r<10>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-SM80-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM80-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM80-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: add.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-SM80-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_fadd_half_reassoc( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<6>; -; CHECK-SM100-NEXT: .reg .b32 %r<10>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-SM100-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM100-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM100-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: add.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-SM100-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_fadd_half_reassoc( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<6>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; 
CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; +; CHECK-NEXT: add.rn.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: add.rn.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: add.rn.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: add.rn.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: mov.b16 %rs4, 0x0000; +; CHECK-NEXT: add.rn.f16 %rs5, %rs3, %rs4; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in) ret half %res } @@ -205,41 +182,20 @@ define half @reduce_fmul_half(<8 x half> %in) { } define half @reduce_fmul_half_reassoc(<8 x half> %in) { -; CHECK-SM80-LABEL: reduce_fmul_half_reassoc( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<10>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_fmul_half_reassoc( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<10>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_fmul_half_reassoc( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; +; CHECK-NEXT: mul.rn.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: mul.rn.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: mul.rn.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: mul.rn.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in) ret half %res } @@ -401,7 +357,6 @@ define half @reduce_fmax_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. 
define float @reduce_fmax_float(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmax_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -423,7 +378,6 @@ define float @reduce_fmax_float(<8 x float> %in) { } define float @reduce_fmax_float_reassoc(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmax_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -445,7 +399,6 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { } define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) { -; ; CHECK-LABEL: reduce_fmax_float_reassoc_nonpow2( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<14>; @@ -533,7 +486,6 @@ define half @reduce_fmin_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. define float @reduce_fmin_float(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmin_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -555,7 +507,6 @@ define float @reduce_fmin_float(<8 x float> %in) { } define float @reduce_fmin_float_reassoc(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmin_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -665,7 +616,6 @@ define half @reduce_fmaximum_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. define float @reduce_fmaximum_float(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmaximum_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -687,7 +637,6 @@ define float @reduce_fmaximum_float(<8 x float> %in) { } define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmaximum_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -709,7 +658,6 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { } define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) { -; ; CHECK-LABEL: reduce_fmaximum_float_reassoc_nonpow2( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<14>; @@ -797,7 +745,6 @@ define half @reduce_fminimum_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. 
define float @reduce_fminimum_float(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fminimum_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -819,7 +766,6 @@ define float @reduce_fminimum_float(<8 x float> %in) { } define float @reduce_fminimum_float_reassoc(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fminimum_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -841,7 +787,6 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) { } define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) { -; ; CHECK-LABEL: reduce_fminimum_float_reassoc_nonpow2( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<14>; @@ -888,20 +833,17 @@ define i16 @reduce_add_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_add_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; ; CHECK-SM100-NEXT: add.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: add.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: add.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: add.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: add.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.add(<8 x i16> %in) ret i16 %res @@ -1114,20 +1056,17 @@ define i16 @reduce_umax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0]; ; CHECK-SM100-NEXT: max.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: max.u16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: max.u16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umax(<8 x i16> %in) ret i16 %res @@ -1248,20 +1187,17 @@ define i16 @reduce_umin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; ; CHECK-SM100-NEXT: min.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: min.u16x2 %r9, 
%r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: min.u16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umin(<8 x i16> %in) ret i16 %res @@ -1382,20 +1318,17 @@ define i16 @reduce_smax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; ; CHECK-SM100-NEXT: max.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: max.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: max.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smax(<8 x i16> %in) ret i16 %res @@ -1516,20 +1449,17 @@ define i16 @reduce_smin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; ; CHECK-SM100-NEXT: min.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: min.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: min.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in) ret i16 %res @@ -1625,43 +1555,21 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_and_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_and_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; -; CHECK-SM80-NEXT: and.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: and.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: and.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 
[func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_and_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; -; CHECK-SM100-NEXT: and.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: and.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: and.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_and_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; +; CHECK-NEXT: and.b32 %r5, %r2, %r4; +; CHECK-NEXT: and.b32 %r6, %r1, %r3; +; CHECK-NEXT: and.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.and(<8 x i16> %in) ret i16 %res } @@ -1736,43 +1644,21 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_or_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_or_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; -; CHECK-SM80-NEXT: or.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: or.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: or.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_or_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; -; CHECK-SM100-NEXT: or.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: or.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: or.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_or_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; +; CHECK-NEXT: or.b32 %r5, %r2, %r4; +; CHECK-NEXT: or.b32 %r6, %r1, %r3; +; CHECK-NEXT: or.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 
[func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.or(<8 x i16> %in) ret i16 %res } @@ -1847,43 +1733,21 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_xor_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_xor_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; -; CHECK-SM80-NEXT: xor.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: xor.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: xor.b32 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: xor.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_xor_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; -; CHECK-SM100-NEXT: xor.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: xor.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: xor.b32 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: xor.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_xor_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; +; CHECK-NEXT: xor.b32 %r5, %r2, %r4; +; CHECK-NEXT: xor.b32 %r6, %r1, %r3; +; CHECK-NEXT: xor.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.xor(<8 x i16> %in) ret i16 %res } From ef58bcb1567c94caf098b6788cc5dac3e11b0fc7 Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Fri, 13 Jun 2025 14:23:13 -0400 Subject: [PATCH 2/5] Revert "[NVPTX] add combiner rule for final packed op in reduction" This reverts commit 8cbda008607cbcdd8df3a0cc5994349d81fc81b5. --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 114 +----- .../CodeGen/NVPTX/reduction-intrinsics.ll | 340 ++++++++++++------ 2 files changed, 244 insertions(+), 210 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index e2aa907ed8eb8..492f4ab76fdbb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -852,13 +852,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, if (STI.allowFP16Math() || STI.hasBF16Math()) setTargetDAGCombine(ISD::SETCC); - // Combine reduction operations on packed types (e.g. fadd.f16x2) with vector - // shuffles when one of their lanes is a no-op. 
- if (STI.allowFP16Math() || STI.hasBF16Math()) - // already added above: FADD, ADD, AND - setTargetDAGCombine({ISD::FMUL, ISD::FMINIMUM, ISD::FMAXIMUM, ISD::UMIN, - ISD::UMAX, ISD::SMIN, ISD::SMAX, ISD::OR, ISD::XOR}); - // Promote fp16 arithmetic if fp16 hardware isn't available or the // user passed --nvptx-no-fp16-math. The flag is useful because, // although sm_53+ GPUs have some sort of FP16 support in @@ -5076,102 +5069,20 @@ static SDValue PerformStoreRetvalCombine(SDNode *N) { return PerformStoreCombineHelper(N, 2, 0); } -/// For vector reductions, the final result needs to be a scalar. The default -/// expansion will use packed ops (ex. fadd.f16x2) even for the final operation. -/// This requires a packed operation where one of the lanes is undef. -/// -/// ex: lowering of vecreduce_fadd(V) where V = v4f16 -/// -/// v1: v2f16 = fadd reassoc v2f16, v2f16 (== ) -/// v2: v2f16 = vector_shuffle<1,u> v1, undef:v2f16 (== ) -/// v3: v2f16 = fadd reassoc v2, v1 (== ) -/// vR: f16 = extractelt v3, 1 -/// -/// We wish to replace vR, v3, and v2 with: -/// vR: f16 = fadd reassoc (extractelt v1, 1) (extractelt v1, 0) -/// -/// ...so that we get: -/// v1: v2f16 = fadd reassoc v2f16, v2f16 (== ) -/// s1: f16 = extractelt v1, 1 -/// s2: f16 = extractelt v1, 0 -/// vR: f16 = fadd reassoc s1, s2 (== a+c+b+d) -/// -/// So for this example, this rule will replace v3 and v2, returning a vector -/// with the result in lane 0 and an undef in lane 1, which we expect will be -/// folded into the extractelt in vR. -static SDValue PerformPackedOpCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - // Convert: - // (fop.x2 (vector_shuffle A), B) -> ((fop A:i, B:0), undef) - // ...or... - // (fop.x2 (vector_shuffle A), B) -> (undef, (fop A:i, B:1)) - // ...where i is a valid index and u is poison. - const EVT VectorVT = N->getValueType(0); - if (!Isv2x16VT(VectorVT)) - return SDValue(); - - SDLoc DL(N); - - SDValue ShufOp = N->getOperand(0); - SDValue VectOp = N->getOperand(1); - bool Swapped = false; - - // canonicalize shuffle to op0 - if (VectOp.getOpcode() == ISD::VECTOR_SHUFFLE) { - std::swap(ShufOp, VectOp); - Swapped = true; - } - - if (ShufOp.getOpcode() != ISD::VECTOR_SHUFFLE) - return SDValue(); - - auto *ShuffleOp = cast(ShufOp); - int LiveLane; // exclusively live lane - for (LiveLane = 0; LiveLane < 2; ++LiveLane) { - // check if the current lane is live and the other lane is dead - if (ShuffleOp->getMaskElt(LiveLane) != PoisonMaskElem && - ShuffleOp->getMaskElt(!LiveLane) == PoisonMaskElem) - break; - } - if (LiveLane == 2) - return SDValue(); - - int ElementIdx = ShuffleOp->getMaskElt(LiveLane); - const EVT ScalarVT = VectorVT.getScalarType(); - SDValue Lanes[2] = {}; - for (auto [LaneID, LaneVal] : enumerate(Lanes)) { - if (LaneID == (unsigned)LiveLane) { - SDValue Operands[2] = { - DCI.DAG.getExtractVectorElt(DL, ScalarVT, ShufOp.getOperand(0), - ElementIdx), - DCI.DAG.getExtractVectorElt(DL, ScalarVT, VectOp, LiveLane)}; - // preserve the order of operands - if (Swapped) - std::swap(Operands[0], Operands[1]); - LaneVal = DCI.DAG.getNode(N->getOpcode(), DL, ScalarVT, Operands); - } else { - LaneVal = DCI.DAG.getUNDEF(ScalarVT); - } - } - return DCI.DAG.getBuildVector(VectorVT, DL, Lanes); -} - /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 
/// static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel) { + if (OptLevel == CodeGenOptLevel::None) + return SDValue(); + SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Skip non-integer, non-scalar case EVT VT = N0.getValueType(); - if (VT.isVector()) - return PerformPackedOpCombine(N, DCI); - if (VT != MVT::i32) - return SDValue(); - - if (OptLevel == CodeGenOptLevel::None) + if (VT.isVector() || VT != MVT::i32) return SDValue(); // First try with the default operand order. @@ -5191,10 +5102,7 @@ static SDValue PerformFADDCombine(SDNode *N, SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); - if (VT.isVector()) - return PerformPackedOpCombine(N, DCI); - - if (!(VT == MVT::f32 || VT == MVT::f64)) + if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64)) return SDValue(); // First try with the default operand order. @@ -5297,7 +5205,7 @@ static SDValue PerformANDCombine(SDNode *N, DCI.CombineTo(N, Val, AddTo); } - return PerformPackedOpCombine(N, DCI); + return SDValue(); } static SDValue PerformREMCombine(SDNode *N, @@ -5778,16 +5686,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformADDCombine(N, DCI, OptLevel); case ISD::FADD: return PerformFADDCombine(N, DCI, OptLevel); - case ISD::FMUL: - case ISD::FMINNUM: - case ISD::FMAXIMUM: - case ISD::UMIN: - case ISD::UMAX: - case ISD::SMIN: - case ISD::SMAX: - case ISD::OR: - case ISD::XOR: - return PerformPackedOpCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, OptLevel); case ISD::SHL: diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index ca03550bdefcd..d5b451dad7bc3 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -5,10 +5,10 @@ ; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_80 %} -; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx86 -O0 \ +; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM100 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx86 -O0 \ +; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_100 %} target triple = "nvptx64-nvidia-cuda" @@ -43,22 +43,45 @@ define half @reduce_fadd_half(<8 x half> %in) { } define half @reduce_fadd_half_reassoc(<8 x half> %in) { -; CHECK-LABEL: reduce_fadd_half_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<6>; -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: add.rn.f16 %rs3, %rs1, %rs2; -; CHECK-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fadd_half_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b16 %rs<6>; +; CHECK-SM80-NEXT: .reg .b32 %r<10>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; +; CHECK-SM80-NEXT: add.rn.f16x2 %r5, %r2, %r4; +; 
CHECK-SM80-NEXT: add.rn.f16x2 %r6, %r1, %r3; +; CHECK-SM80-NEXT: add.rn.f16x2 %r7, %r6, %r5; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } +; CHECK-SM80-NEXT: // implicit-def: %rs2 +; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM80-NEXT: add.rn.f16x2 %r9, %r7, %r8; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } +; CHECK-SM80-NEXT: mov.b16 %rs4, 0x0000; +; CHECK-SM80-NEXT: add.rn.f16 %rs5, %rs3, %rs4; +; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fadd_half_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<6>; +; CHECK-SM100-NEXT: .reg .b32 %r<10>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; +; CHECK-SM100-NEXT: add.rn.f16x2 %r5, %r2, %r4; +; CHECK-SM100-NEXT: add.rn.f16x2 %r6, %r1, %r3; +; CHECK-SM100-NEXT: add.rn.f16x2 %r7, %r6, %r5; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: add.rn.f16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: mov.b16 %rs4, 0x0000; +; CHECK-SM100-NEXT: add.rn.f16 %rs5, %rs3, %rs4; +; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-SM100-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in) ret half %res } @@ -182,20 +205,41 @@ define half @reduce_fmul_half(<8 x half> %in) { } define half @reduce_fmul_half_reassoc(<8 x half> %in) { -; CHECK-LABEL: reduce_fmul_half_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: mul.rn.f16 %rs3, %rs1, %rs2; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmul_half_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NEXT: .reg .b32 %r<10>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; +; CHECK-SM80-NEXT: mul.rn.f16x2 %r5, %r2, %r4; +; CHECK-SM80-NEXT: mul.rn.f16x2 %r6, %r1, %r3; +; CHECK-SM80-NEXT: mul.rn.f16x2 %r7, %r6, %r5; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } +; CHECK-SM80-NEXT: // implicit-def: %rs2 +; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM80-NEXT: mul.rn.f16x2 %r9, %r7, %r8; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } +; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmul_half_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<4>; +; CHECK-SM100-NEXT: .reg .b32 %r<10>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; +; CHECK-SM100-NEXT: mul.rn.f16x2 %r5, %r2, %r4; +; CHECK-SM100-NEXT: mul.rn.f16x2 %r6, %r1, %r3; +; CHECK-SM100-NEXT: mul.rn.f16x2 %r7, %r6, %r5; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: mul.rn.f16x2 %r9, %r7, %r8; +; 
CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM100-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in) ret half %res } @@ -357,6 +401,7 @@ define half @reduce_fmax_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. define float @reduce_fmax_float(<8 x float> %in) { +; ; CHECK-LABEL: reduce_fmax_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -378,6 +423,7 @@ define float @reduce_fmax_float(<8 x float> %in) { } define float @reduce_fmax_float_reassoc(<8 x float> %in) { +; ; CHECK-LABEL: reduce_fmax_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -399,6 +445,7 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { } define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) { +; ; CHECK-LABEL: reduce_fmax_float_reassoc_nonpow2( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<14>; @@ -486,6 +533,7 @@ define half @reduce_fmin_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. define float @reduce_fmin_float(<8 x float> %in) { +; ; CHECK-LABEL: reduce_fmin_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -507,6 +555,7 @@ define float @reduce_fmin_float(<8 x float> %in) { } define float @reduce_fmin_float_reassoc(<8 x float> %in) { +; ; CHECK-LABEL: reduce_fmin_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -616,6 +665,7 @@ define half @reduce_fmaximum_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. define float @reduce_fmaximum_float(<8 x float> %in) { +; ; CHECK-LABEL: reduce_fmaximum_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -637,6 +687,7 @@ define float @reduce_fmaximum_float(<8 x float> %in) { } define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { +; ; CHECK-LABEL: reduce_fmaximum_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -658,6 +709,7 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { } define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) { +; ; CHECK-LABEL: reduce_fmaximum_float_reassoc_nonpow2( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<14>; @@ -745,6 +797,7 @@ define half @reduce_fminimum_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. 
define float @reduce_fminimum_float(<8 x float> %in) { +; ; CHECK-LABEL: reduce_fminimum_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -766,6 +819,7 @@ define float @reduce_fminimum_float(<8 x float> %in) { } define float @reduce_fminimum_float_reassoc(<8 x float> %in) { +; ; CHECK-LABEL: reduce_fminimum_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -787,6 +841,7 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) { } define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) { +; ; CHECK-LABEL: reduce_fminimum_float_reassoc_nonpow2( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<14>; @@ -833,17 +888,20 @@ define i16 @reduce_add_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_add_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<9>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; ; CHECK-SM100-NEXT: add.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: add.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: add.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-SM100-NEXT: add.s16 %rs3, %rs1, %rs2; -; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: add.s16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.add(<8 x i16> %in) ret i16 %res @@ -1056,17 +1114,20 @@ define i16 @reduce_umax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<9>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0]; ; CHECK-SM100-NEXT: max.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-SM100-NEXT: max.u16 %rs3, %rs1, %rs2; -; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: max.u16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umax(<8 x i16> %in) ret i16 %res @@ -1187,17 +1248,20 @@ define i16 @reduce_umin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<9>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; ; CHECK-SM100-NEXT: min.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-SM100-NEXT: min.u16 %rs3, %rs1, %rs2; -; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-SM100-NEXT: 
st.param.b32 [func_retval0], %r8; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: min.u16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umin(<8 x i16> %in) ret i16 %res @@ -1318,17 +1382,20 @@ define i16 @reduce_smax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<9>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; ; CHECK-SM100-NEXT: max.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-SM100-NEXT: max.s16 %rs3, %rs1, %rs2; -; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: max.s16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smax(<8 x i16> %in) ret i16 %res @@ -1449,17 +1516,20 @@ define i16 @reduce_smin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<9>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; ; CHECK-SM100-NEXT: min.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-SM100-NEXT: min.s16 %rs3, %rs1, %rs2; -; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: min.s16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in) ret i16 %res @@ -1555,21 +1625,43 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_and_i16(<8 x i16> %in) { -; CHECK-LABEL: reduce_and_i16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; -; CHECK-NEXT: and.b32 %r5, %r2, %r4; -; CHECK-NEXT: and.b32 %r6, %r1, %r3; -; CHECK-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_and_i16( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; 
CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; +; CHECK-SM80-NEXT: and.b32 %r5, %r2, %r4; +; CHECK-SM80-NEXT: and.b32 %r6, %r1, %r3; +; CHECK-SM80-NEXT: and.b32 %r7, %r6, %r5; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } +; CHECK-SM80-NEXT: // implicit-def: %rs2 +; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM80-NEXT: and.b32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } +; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_and_i16( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<4>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; +; CHECK-SM100-NEXT: and.b32 %r5, %r2, %r4; +; CHECK-SM100-NEXT: and.b32 %r6, %r1, %r3; +; CHECK-SM100-NEXT: and.b32 %r7, %r6, %r5; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: and.b32 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.and(<8 x i16> %in) ret i16 %res } @@ -1644,21 +1736,43 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_or_i16(<8 x i16> %in) { -; CHECK-LABEL: reduce_or_i16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; -; CHECK-NEXT: or.b32 %r5, %r2, %r4; -; CHECK-NEXT: or.b32 %r6, %r1, %r3; -; CHECK-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_or_i16( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; +; CHECK-SM80-NEXT: or.b32 %r5, %r2, %r4; +; CHECK-SM80-NEXT: or.b32 %r6, %r1, %r3; +; CHECK-SM80-NEXT: or.b32 %r7, %r6, %r5; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } +; CHECK-SM80-NEXT: // implicit-def: %rs2 +; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM80-NEXT: or.b32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } +; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_or_i16( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<4>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; +; CHECK-SM100-NEXT: or.b32 %r5, %r2, %r4; +; CHECK-SM100-NEXT: or.b32 %r6, %r1, %r3; +; CHECK-SM100-NEXT: or.b32 %r7, %r6, %r5; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: or.b32 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 
[func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.or(<8 x i16> %in) ret i16 %res } @@ -1733,21 +1847,43 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_xor_i16(<8 x i16> %in) { -; CHECK-LABEL: reduce_xor_i16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; -; CHECK-NEXT: xor.b32 %r5, %r2, %r4; -; CHECK-NEXT: xor.b32 %r6, %r1, %r3; -; CHECK-NEXT: xor.b32 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_xor_i16( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; +; CHECK-SM80-NEXT: xor.b32 %r5, %r2, %r4; +; CHECK-SM80-NEXT: xor.b32 %r6, %r1, %r3; +; CHECK-SM80-NEXT: xor.b32 %r7, %r6, %r5; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } +; CHECK-SM80-NEXT: // implicit-def: %rs2 +; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM80-NEXT: xor.b32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } +; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_xor_i16( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<4>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; +; CHECK-SM100-NEXT: xor.b32 %r5, %r2, %r4; +; CHECK-SM100-NEXT: xor.b32 %r6, %r1, %r3; +; CHECK-SM100-NEXT: xor.b32 %r7, %r6, %r5; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: xor.b32 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.xor(<8 x i16> %in) ret i16 %res } From a0b4c9890afa7e840980a2217271718f97e8a2f6 Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Fri, 13 Jun 2025 20:09:54 -0400 Subject: [PATCH 3/5] [DAGCombiner] add rule for vector reduction op with undef lane The result of a reduction needs to be a scalar. When expressed as a sequence of vector ops, the final op needs to reduce two (or more) lanes belonging to the same register. On some targets a shuffle is not available and this results in two extra movs to setup another vector register with the lanes swapped. This pattern is now handled better by turning it into a scalar op. 
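
[Editorial illustration, not part of the original commit message; value names
and the v2f16 type are only an example.] In IR terms, the tail of a
shuffle-expanded reduction that this fold targets looks roughly like:

    %sum  = fadd reassoc <2 x half> %lo, %hi
    %swap = shufflevector <2 x half> %sum, <2 x half> poison, <2 x i32> <i32 1, i32 poison>
    %last = fadd reassoc <2 x half> %swap, %sum      ; packed op, lane 1 is undef
    %res  = extractelement <2 x half> %last, i64 0

With the new combine, the DAG equivalent of the last two instructions is
rewritten to extract both lanes of %sum and combine them with a single scalar
op, which is what the updated NVPTX checks below expect (one scalar add.rn.f16
rather than an extra mov plus a packed add on a register with an undef lane):

    %a   = extractelement <2 x half> %sum, i64 1
    %b   = extractelement <2 x half> %sum, i64 0
    %res = fadd reassoc half %a, %b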
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  95 +++++
 .../CodeGen/NVPTX/reduction-intrinsics.ll     | 325 ++++++------------
 2 files changed, 195 insertions(+), 225 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0e078f9dd88b4..aaecec3855055 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -582,6 +582,7 @@ namespace {
     SDValue reassociateReduction(unsigned RedOpc, unsigned Opc, const SDLoc &DL,
                                  EVT VT, SDValue N0, SDValue N1,
                                  SDNodeFlags Flags = SDNodeFlags());
+    SDValue foldReductionWithUndefLane(SDNode *N);
 
     SDValue visitShiftByConstant(SDNode *N);
 
@@ -1349,6 +1350,75 @@ SDValue DAGCombiner::reassociateReduction(unsigned RedOpc, unsigned Opc,
   return SDValue();
 }
 
+// Convert:
+// (op.x2 (vector_shuffle<i, u> A), B) -> <(op A:i, B:0) undef>
+// ...or...
+// (op.x2 (vector_shuffle<u, i> A), B) -> <undef (op A:i, B:1)>
+// ...where i is a valid index and u is poison.
+SDValue DAGCombiner::foldReductionWithUndefLane(SDNode *N) {
+  const EVT VectorVT = N->getValueType(0);
+
+  // Only support 2-packed vectors for now.
+  if (!VectorVT.isVector() || VectorVT.isScalableVector()
+      || VectorVT.getVectorNumElements() != 2)
+    return SDValue();
+
+  // If the operation is already unsupported, we don't need to do this
+  // operation.
+  if (!TLI.isOperationLegal(N->getOpcode(), VectorVT))
+    return SDValue();
+
+  // If vector shuffle is supported on the target, this optimization may
+  // increase register pressure.
+  if (TLI.isOperationLegalOrCustomOrPromote(ISD::VECTOR_SHUFFLE, VectorVT))
+    return SDValue();
+
+  SDLoc DL(N);
+
+  SDValue ShufOp = N->getOperand(0);
+  SDValue VectOp = N->getOperand(1);
+  bool Swapped = false;
+
+  // canonicalize shuffle op
+  if (VectOp.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    std::swap(ShufOp, VectOp);
+    Swapped = true;
+  }
+
+  if (ShufOp.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return SDValue();
+
+  auto *ShuffleOp = cast<ShuffleVectorSDNode>(ShufOp);
+  int LiveLane; // exclusively live lane
+  for (LiveLane = 0; LiveLane < 2; ++LiveLane) {
+    // check if the current lane is live and the other lane is dead
+    if (ShuffleOp->getMaskElt(LiveLane) != PoisonMaskElem &&
+        ShuffleOp->getMaskElt(!LiveLane) == PoisonMaskElem)
+      break;
+  }
+  if (LiveLane == 2)
+    return SDValue();
+
+  const int ElementIdx = ShuffleOp->getMaskElt(LiveLane);
+  const EVT ScalarVT = VectorVT.getScalarType();
+  SDValue Lanes[2] = {};
+  for (auto [LaneID, LaneVal] : enumerate(Lanes)) {
+    if (LaneID == (unsigned)LiveLane) {
+      SDValue Operands[2] = {
+          DAG.getExtractVectorElt(DL, ScalarVT, ShufOp.getOperand(0),
+                                  ElementIdx),
+          DAG.getExtractVectorElt(DL, ScalarVT, VectOp, LiveLane)};
+      // preserve the order of operands
+      if (Swapped)
+        std::swap(Operands[0], Operands[1]);
+      LaneVal = DAG.getNode(N->getOpcode(), DL, ScalarVT, Operands);
+    } else {
+      LaneVal = DAG.getUNDEF(ScalarVT);
+    }
+  }
+  return DAG.getBuildVector(VectorVT, DL, Lanes);
+}
+
 SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
                                bool AddTo) {
   assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
@@ -3058,6 +3128,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
       return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), SV);
   }
 
+  if (SDValue R = foldReductionWithUndefLane(N))
+    return R;
+
   return SDValue();
 }
 
@@ -6001,6 +6074,9 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
                                                     SDLoc(N), VT, N0, N1))
     return SD;
 
+  if (SDValue SD = foldReductionWithUndefLane(N))
+    return SD;
+
   // Simplify the operands using demanded-bits information.
if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -7301,6 +7377,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } } + + if (SDValue R = foldReductionWithUndefLane(N)) + return R; } // fold (and x, -1) -> x @@ -8260,6 +8339,9 @@ SDValue DAGCombiner::visitOR(SDNode *N) { } } } + + if (SDValue R = foldReductionWithUndefLane(N)) + return R; } // fold (or x, 0) -> x @@ -9941,6 +10023,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N)) return Combined; + if (SDValue R = foldReductionWithUndefLane(N)) + return R; + return SDValue(); } @@ -17557,6 +17642,10 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { AddToWorklist(Fused.getNode()); return Fused; } + + if (SDValue R = foldReductionWithUndefLane(N)) + return R; + return SDValue(); } @@ -17925,6 +18014,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (SDValue R = combineFMulOrFDivWithIntPow2(N)) return R; + if (SDValue R = foldReductionWithUndefLane(N)) + return R; + return SDValue(); } @@ -19030,6 +19122,9 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) { Opc, SDLoc(N), VT, N0, N1, Flags)) return SD; + if (SDValue SD = foldReductionWithUndefLane(N)) + return SD; + return SDValue(); } diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index d5b451dad7bc3..caebdcb19c1e0 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -43,45 +43,22 @@ define half @reduce_fadd_half(<8 x half> %in) { } define half @reduce_fadd_half_reassoc(<8 x half> %in) { -; CHECK-SM80-LABEL: reduce_fadd_half_reassoc( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<6>; -; CHECK-SM80-NEXT: .reg .b32 %r<10>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-SM80-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM80-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM80-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: add.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-SM80-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_fadd_half_reassoc( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<6>; -; CHECK-SM100-NEXT: .reg .b32 %r<10>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-SM100-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM100-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM100-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: add.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-SM100-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_fadd_half_reassoc( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<6>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, 
[reduce_fadd_half_reassoc_param_0]; +; CHECK-NEXT: add.rn.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: add.rn.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: add.rn.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: add.rn.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: mov.b16 %rs4, 0x0000; +; CHECK-NEXT: add.rn.f16 %rs5, %rs3, %rs4; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in) ret half %res } @@ -205,41 +182,20 @@ define half @reduce_fmul_half(<8 x half> %in) { } define half @reduce_fmul_half_reassoc(<8 x half> %in) { -; CHECK-SM80-LABEL: reduce_fmul_half_reassoc( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<10>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_fmul_half_reassoc( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<10>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_fmul_half_reassoc( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; +; CHECK-NEXT: mul.rn.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: mul.rn.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: mul.rn.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: mul.rn.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in) ret half %res } @@ -888,20 +844,17 @@ define i16 @reduce_add_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_add_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; ; CHECK-SM100-NEXT: add.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: add.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: add.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: add.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; 
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: add.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.add(<8 x i16> %in) ret i16 %res @@ -1114,20 +1067,17 @@ define i16 @reduce_umax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0]; ; CHECK-SM100-NEXT: max.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: max.u16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: max.u16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umax(<8 x i16> %in) ret i16 %res @@ -1248,20 +1198,17 @@ define i16 @reduce_umin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; ; CHECK-SM100-NEXT: min.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: min.u16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: min.u16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umin(<8 x i16> %in) ret i16 %res @@ -1382,20 +1329,17 @@ define i16 @reduce_smax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; ; CHECK-SM100-NEXT: max.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: max.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: max.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: 
cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smax(<8 x i16> %in) ret i16 %res @@ -1516,20 +1460,17 @@ define i16 @reduce_smin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; ; CHECK-SM100-NEXT: min.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: min.s16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-SM100-NEXT: min.s16 %rs3, %rs1, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in) ret i16 %res @@ -1625,43 +1566,21 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_and_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_and_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; -; CHECK-SM80-NEXT: and.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: and.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: and.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_and_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; -; CHECK-SM100-NEXT: and.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: and.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: and.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_and_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; +; CHECK-NEXT: and.b32 %r5, %r2, %r4; +; CHECK-NEXT: and.b32 %r6, %r1, %r3; +; CHECK-NEXT: and.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.and(<8 x i16> %in) ret 
i16 %res } @@ -1736,43 +1655,21 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_or_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_or_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; -; CHECK-SM80-NEXT: or.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: or.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: or.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_or_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; -; CHECK-SM100-NEXT: or.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: or.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: or.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_or_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; +; CHECK-NEXT: or.b32 %r5, %r2, %r4; +; CHECK-NEXT: or.b32 %r6, %r1, %r3; +; CHECK-NEXT: or.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.or(<8 x i16> %in) ret i16 %res } @@ -1847,43 +1744,21 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_xor_i16(<8 x i16> %in) { -; CHECK-SM80-LABEL: reduce_xor_i16( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<11>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; -; CHECK-SM80-NEXT: xor.b32 %r5, %r2, %r4; -; CHECK-SM80-NEXT: xor.b32 %r6, %r1, %r3; -; CHECK-SM80-NEXT: xor.b32 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: xor.b32 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_xor_i16( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<11>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; -; CHECK-SM100-NEXT: xor.b32 %r5, %r2, %r4; -; CHECK-SM100-NEXT: xor.b32 %r6, %r1, %r3; -; CHECK-SM100-NEXT: xor.b32 %r7, 
%r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: xor.b32 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_xor_i16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; +; CHECK-NEXT: xor.b32 %r5, %r2, %r4; +; CHECK-NEXT: xor.b32 %r6, %r1, %r3; +; CHECK-NEXT: xor.b32 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ret; %res = call i16 @llvm.vector.reduce.xor(<8 x i16> %in) ret i16 %res } From ec1faa0826b45a827b58678c9b62be7c26dd291b Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Wed, 18 Jun 2025 12:47:19 -0400 Subject: [PATCH 4/5] Revert "[DAGCombiner] add rule for vector reduction op with undef lane" This reverts commit 371a3ad352b19ff42190e64f0360d9f79245e25f. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 95 ----- .../CodeGen/NVPTX/reduction-intrinsics.ll | 325 ++++++++++++------ 2 files changed, 225 insertions(+), 195 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index aaecec3855055..0e078f9dd88b4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -582,7 +582,6 @@ namespace { SDValue reassociateReduction(unsigned RedOpc, unsigned Opc, const SDLoc &DL, EVT VT, SDValue N0, SDValue N1, SDNodeFlags Flags = SDNodeFlags()); - SDValue foldReductionWithUndefLane(SDNode *N); SDValue visitShiftByConstant(SDNode *N); @@ -1350,75 +1349,6 @@ SDValue DAGCombiner::reassociateReduction(unsigned RedOpc, unsigned Opc, return SDValue(); } -// Convert: -// (op.x2 (vector_shuffle A), B) -> <(op A:i, B:0) undef> -// ...or... -// (op.x2 (vector_shuffle A), B) -> -// ...where i is a valid index and u is poison. -SDValue DAGCombiner::foldReductionWithUndefLane(SDNode *N) { - const EVT VectorVT = N->getValueType(0); - - // Only support 2-packed vectors for now. - if (!VectorVT.isVector() || VectorVT.isScalableVector() - || VectorVT.getVectorNumElements() != 2) - return SDValue(); - - // If the operation is already unsupported, we don't need to do this - // operation. - if (!TLI.isOperationLegal(N->getOpcode(), VectorVT)) - return SDValue(); - - // If vector shuffle is supported on the target, this optimization may - // increase register pressure. 
- if (TLI.isOperationLegalOrCustomOrPromote(ISD::VECTOR_SHUFFLE, VectorVT)) - return SDValue(); - - SDLoc DL(N); - - SDValue ShufOp = N->getOperand(0); - SDValue VectOp = N->getOperand(1); - bool Swapped = false; - - // canonicalize shuffle op - if (VectOp.getOpcode() == ISD::VECTOR_SHUFFLE) { - std::swap(ShufOp, VectOp); - Swapped = true; - } - - if (ShufOp.getOpcode() != ISD::VECTOR_SHUFFLE) - return SDValue(); - - auto *ShuffleOp = cast(ShufOp); - int LiveLane; // exclusively live lane - for (LiveLane = 0; LiveLane < 2; ++LiveLane) { - // check if the current lane is live and the other lane is dead - if (ShuffleOp->getMaskElt(LiveLane) != PoisonMaskElem && - ShuffleOp->getMaskElt(!LiveLane) == PoisonMaskElem) - break; - } - if (LiveLane == 2) - return SDValue(); - - const int ElementIdx = ShuffleOp->getMaskElt(LiveLane); - const EVT ScalarVT = VectorVT.getScalarType(); - SDValue Lanes[2] = {}; - for (auto [LaneID, LaneVal] : enumerate(Lanes)) { - if (LaneID == (unsigned)LiveLane) { - SDValue Operands[2] = { - DAG.getExtractVectorElt(DL, ScalarVT, ShufOp.getOperand(0), - ElementIdx), - DAG.getExtractVectorElt(DL, ScalarVT, VectOp, LiveLane)}; - // preserve the order of operands - if (Swapped) - std::swap(Operands[0], Operands[1]); - LaneVal = DAG.getNode(N->getOpcode(), DL, ScalarVT, Operands); - } else { - LaneVal = DAG.getUNDEF(ScalarVT); - } - } - return DAG.getBuildVector(VectorVT, DL, Lanes); -} - SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo) { assert(N->getNumValues() == NumTo && "Broken CombineTo call!"); @@ -3128,9 +3058,6 @@ SDValue DAGCombiner::visitADD(SDNode *N) { return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), SV); } - if (SDValue R = foldReductionWithUndefLane(N)) - return R; - return SDValue(); } @@ -6074,9 +6001,6 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { SDLoc(N), VT, N0, N1)) return SD; - if (SDValue SD = foldReductionWithUndefLane(N)) - return SD; - // Simplify the operands using demanded-bits information. 
if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); @@ -7377,9 +7301,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } } - - if (SDValue R = foldReductionWithUndefLane(N)) - return R; } // fold (and x, -1) -> x @@ -8339,9 +8260,6 @@ SDValue DAGCombiner::visitOR(SDNode *N) { } } } - - if (SDValue R = foldReductionWithUndefLane(N)) - return R; } // fold (or x, 0) -> x @@ -10023,9 +9941,6 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N)) return Combined; - if (SDValue R = foldReductionWithUndefLane(N)) - return R; - return SDValue(); } @@ -17642,10 +17557,6 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { AddToWorklist(Fused.getNode()); return Fused; } - - if (SDValue R = foldReductionWithUndefLane(N)) - return R; - return SDValue(); } @@ -18014,9 +17925,6 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (SDValue R = combineFMulOrFDivWithIntPow2(N)) return R; - if (SDValue R = foldReductionWithUndefLane(N)) - return R; - return SDValue(); } @@ -19122,9 +19030,6 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) { Opc, SDLoc(N), VT, N0, N1, Flags)) return SD; - if (SDValue SD = foldReductionWithUndefLane(N)) - return SD; - return SDValue(); } diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index caebdcb19c1e0..d5b451dad7bc3 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -43,22 +43,45 @@ define half @reduce_fadd_half(<8 x half> %in) { } define half @reduce_fadd_half_reassoc(<8 x half> %in) { -; CHECK-LABEL: reduce_fadd_half_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<6>; -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: add.rn.f16 %rs3, %rs1, %rs2; -; CHECK-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fadd_half_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b16 %rs<6>; +; CHECK-SM80-NEXT: .reg .b32 %r<10>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; +; CHECK-SM80-NEXT: add.rn.f16x2 %r5, %r2, %r4; +; CHECK-SM80-NEXT: add.rn.f16x2 %r6, %r1, %r3; +; CHECK-SM80-NEXT: add.rn.f16x2 %r7, %r6, %r5; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } +; CHECK-SM80-NEXT: // implicit-def: %rs2 +; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM80-NEXT: add.rn.f16x2 %r9, %r7, %r8; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } +; CHECK-SM80-NEXT: mov.b16 %rs4, 0x0000; +; CHECK-SM80-NEXT: add.rn.f16 %rs5, %rs3, %rs4; +; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fadd_half_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<6>; +; CHECK-SM100-NEXT: .reg .b32 %r<10>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; +; CHECK-SM100-NEXT: add.rn.f16x2 %r5, %r2, %r4; +; CHECK-SM100-NEXT: add.rn.f16x2 %r6, %r1, %r3; +; CHECK-SM100-NEXT: add.rn.f16x2 %r7, %r6, %r5; +; 
CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: add.rn.f16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: mov.b16 %rs4, 0x0000; +; CHECK-SM100-NEXT: add.rn.f16 %rs5, %rs3, %rs4; +; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-SM100-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in) ret half %res } @@ -182,20 +205,41 @@ define half @reduce_fmul_half(<8 x half> %in) { } define half @reduce_fmul_half_reassoc(<8 x half> %in) { -; CHECK-LABEL: reduce_fmul_half_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: mul.rn.f16 %rs3, %rs1, %rs2; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmul_half_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NEXT: .reg .b32 %r<10>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; +; CHECK-SM80-NEXT: mul.rn.f16x2 %r5, %r2, %r4; +; CHECK-SM80-NEXT: mul.rn.f16x2 %r6, %r1, %r3; +; CHECK-SM80-NEXT: mul.rn.f16x2 %r7, %r6, %r5; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } +; CHECK-SM80-NEXT: // implicit-def: %rs2 +; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM80-NEXT: mul.rn.f16x2 %r9, %r7, %r8; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } +; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmul_half_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<4>; +; CHECK-SM100-NEXT: .reg .b32 %r<10>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; +; CHECK-SM100-NEXT: mul.rn.f16x2 %r5, %r2, %r4; +; CHECK-SM100-NEXT: mul.rn.f16x2 %r6, %r1, %r3; +; CHECK-SM100-NEXT: mul.rn.f16x2 %r7, %r6, %r5; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: mul.rn.f16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-SM100-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in) ret half %res } @@ -844,17 +888,20 @@ define i16 @reduce_add_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_add_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<9>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; ; CHECK-SM100-NEXT: add.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: add.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: add.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-SM100-NEXT: add.s16 %rs3, %rs1, %rs2; -; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // 
implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: add.s16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.add(<8 x i16> %in) ret i16 %res @@ -1067,17 +1114,20 @@ define i16 @reduce_umax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<9>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0]; ; CHECK-SM100-NEXT: max.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-SM100-NEXT: max.u16 %rs3, %rs1, %rs2; -; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: max.u16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umax(<8 x i16> %in) ret i16 %res @@ -1198,17 +1248,20 @@ define i16 @reduce_umin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_umin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<9>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; ; CHECK-SM100-NEXT: min.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.u16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-SM100-NEXT: min.u16 %rs3, %rs1, %rs2; -; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: min.u16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.umin(<8 x i16> %in) ret i16 %res @@ -1329,17 +1382,20 @@ define i16 @reduce_smax_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smax_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<9>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; ; CHECK-SM100-NEXT: max.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-SM100-NEXT: max.s16 %rs3, %rs1, %rs2; -; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: max.s16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; 
CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smax(<8 x i16> %in) ret i16 %res @@ -1460,17 +1516,20 @@ define i16 @reduce_smin_i16(<8 x i16> %in) { ; CHECK-SM100-LABEL: reduce_smin_i16( ; CHECK-SM100: { ; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<9>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; ; CHECK-SM100-NEXT: min.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.s16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-SM100-NEXT: min.s16 %rs3, %rs1, %rs2; -; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: min.s16x2 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; ; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in) ret i16 %res @@ -1566,21 +1625,43 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_and_i16(<8 x i16> %in) { -; CHECK-LABEL: reduce_and_i16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; -; CHECK-NEXT: and.b32 %r5, %r2, %r4; -; CHECK-NEXT: and.b32 %r6, %r1, %r3; -; CHECK-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_and_i16( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; +; CHECK-SM80-NEXT: and.b32 %r5, %r2, %r4; +; CHECK-SM80-NEXT: and.b32 %r6, %r1, %r3; +; CHECK-SM80-NEXT: and.b32 %r7, %r6, %r5; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } +; CHECK-SM80-NEXT: // implicit-def: %rs2 +; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM80-NEXT: and.b32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } +; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_and_i16( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<4>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; +; CHECK-SM100-NEXT: and.b32 %r5, %r2, %r4; +; CHECK-SM100-NEXT: and.b32 %r6, %r1, %r3; +; CHECK-SM100-NEXT: and.b32 %r7, %r6, %r5; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: and.b32 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call i16 
@llvm.vector.reduce.and(<8 x i16> %in) ret i16 %res } @@ -1655,21 +1736,43 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_or_i16(<8 x i16> %in) { -; CHECK-LABEL: reduce_or_i16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; -; CHECK-NEXT: or.b32 %r5, %r2, %r4; -; CHECK-NEXT: or.b32 %r6, %r1, %r3; -; CHECK-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_or_i16( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; +; CHECK-SM80-NEXT: or.b32 %r5, %r2, %r4; +; CHECK-SM80-NEXT: or.b32 %r6, %r1, %r3; +; CHECK-SM80-NEXT: or.b32 %r7, %r6, %r5; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } +; CHECK-SM80-NEXT: // implicit-def: %rs2 +; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM80-NEXT: or.b32 %r9, %r7, %r8; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } +; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_or_i16( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<4>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; +; CHECK-SM100-NEXT: or.b32 %r5, %r2, %r4; +; CHECK-SM100-NEXT: or.b32 %r6, %r1, %r3; +; CHECK-SM100-NEXT: or.b32 %r7, %r6, %r5; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: or.b32 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.or(<8 x i16> %in) ret i16 %res } @@ -1744,21 +1847,43 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) { } define i16 @reduce_xor_i16(<8 x i16> %in) { -; CHECK-LABEL: reduce_xor_i16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; -; CHECK-NEXT: xor.b32 %r5, %r2, %r4; -; CHECK-NEXT: xor.b32 %r6, %r1, %r3; -; CHECK-NEXT: xor.b32 %r7, %r6, %r5; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; -; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_xor_i16( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b16 %rs<4>; +; CHECK-SM80-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; +; CHECK-SM80-NEXT: xor.b32 %r5, %r2, %r4; +; CHECK-SM80-NEXT: xor.b32 %r6, %r1, %r3; +; CHECK-SM80-NEXT: xor.b32 %r7, %r6, %r5; +; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } +; CHECK-SM80-NEXT: // implicit-def: %rs2 +; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM80-NEXT: xor.b32 %r9, %r7, %r8; +; 
CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } +; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_xor_i16( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<4>; +; CHECK-SM100-NEXT: .reg .b32 %r<11>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; +; CHECK-SM100-NEXT: xor.b32 %r5, %r2, %r4; +; CHECK-SM100-NEXT: xor.b32 %r6, %r1, %r3; +; CHECK-SM100-NEXT: xor.b32 %r7, %r6, %r5; +; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; +; CHECK-SM100-NEXT: // implicit-def: %rs2 +; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; +; CHECK-SM100-NEXT: xor.b32 %r9, %r7, %r8; +; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; +; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-SM100-NEXT: ret; %res = call i16 @llvm.vector.reduce.xor(<8 x i16> %in) ret i16 %res } From dfa1a0fc007b777176cfe00a5cc64d1f33cba6e0 Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Wed, 18 Jun 2025 12:47:37 -0400 Subject: [PATCH 5/5] Reapply "[NVPTX] add combiner rule for final packed op in reduction" This reverts commit 9d84bd34f53646a6769331d40516e150a3e05bf2. --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 114 +++++- .../CodeGen/NVPTX/reduction-intrinsics.ll | 340 ++++++------------ 2 files changed, 210 insertions(+), 244 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 492f4ab76fdbb..e2aa907ed8eb8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -852,6 +852,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, if (STI.allowFP16Math() || STI.hasBF16Math()) setTargetDAGCombine(ISD::SETCC); + // Combine reduction operations on packed types (e.g. fadd.f16x2) with vector + // shuffles when one of their lanes is a no-op. + if (STI.allowFP16Math() || STI.hasBF16Math()) + // already added above: FADD, ADD, AND + setTargetDAGCombine({ISD::FMUL, ISD::FMINIMUM, ISD::FMAXIMUM, ISD::UMIN, + ISD::UMAX, ISD::SMIN, ISD::SMAX, ISD::OR, ISD::XOR}); + // Promote fp16 arithmetic if fp16 hardware isn't available or the // user passed --nvptx-no-fp16-math. The flag is useful because, // although sm_53+ GPUs have some sort of FP16 support in @@ -5069,20 +5076,102 @@ static SDValue PerformStoreRetvalCombine(SDNode *N) { return PerformStoreCombineHelper(N, 2, 0); } +/// For vector reductions, the final result needs to be a scalar. The default +/// expansion will use packed ops (ex. fadd.f16x2) even for the final operation. +/// This requires a packed operation where one of the lanes is undef. +/// +/// ex: lowering of vecreduce_fadd(V) where V = v4f16 +/// +/// v1: v2f16 = fadd reassoc v2f16, v2f16 (== ) +/// v2: v2f16 = vector_shuffle<1,u> v1, undef:v2f16 (== ) +/// v3: v2f16 = fadd reassoc v2, v1 (== ) +/// vR: f16 = extractelt v3, 1 +/// +/// We wish to replace vR, v3, and v2 with: +/// vR: f16 = fadd reassoc (extractelt v1, 1) (extractelt v1, 0) +/// +/// ...so that we get: +/// v1: v2f16 = fadd reassoc v2f16, v2f16 (== ) +/// s1: f16 = extractelt v1, 1 +/// s2: f16 = extractelt v1, 0 +/// vR: f16 = fadd reassoc s1, s2 (== a+c+b+d) +/// +/// So for this example, this rule will replace v3 and v2, returning a vector +/// with the result in lane 0 and an undef in lane 1, which we expect will be +/// folded into the extractelt in vR. 
+static SDValue PerformPackedOpCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // Convert: + // (fop.x2 (vector_shuffle A), B) -> ((fop A:i, B:0), undef) + // ...or... + // (fop.x2 (vector_shuffle A), B) -> (undef, (fop A:i, B:1)) + // ...where i is a valid index and u is poison. + const EVT VectorVT = N->getValueType(0); + if (!Isv2x16VT(VectorVT)) + return SDValue(); + + SDLoc DL(N); + + SDValue ShufOp = N->getOperand(0); + SDValue VectOp = N->getOperand(1); + bool Swapped = false; + + // canonicalize shuffle to op0 + if (VectOp.getOpcode() == ISD::VECTOR_SHUFFLE) { + std::swap(ShufOp, VectOp); + Swapped = true; + } + + if (ShufOp.getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + + auto *ShuffleOp = cast(ShufOp); + int LiveLane; // exclusively live lane + for (LiveLane = 0; LiveLane < 2; ++LiveLane) { + // check if the current lane is live and the other lane is dead + if (ShuffleOp->getMaskElt(LiveLane) != PoisonMaskElem && + ShuffleOp->getMaskElt(!LiveLane) == PoisonMaskElem) + break; + } + if (LiveLane == 2) + return SDValue(); + + int ElementIdx = ShuffleOp->getMaskElt(LiveLane); + const EVT ScalarVT = VectorVT.getScalarType(); + SDValue Lanes[2] = {}; + for (auto [LaneID, LaneVal] : enumerate(Lanes)) { + if (LaneID == (unsigned)LiveLane) { + SDValue Operands[2] = { + DCI.DAG.getExtractVectorElt(DL, ScalarVT, ShufOp.getOperand(0), + ElementIdx), + DCI.DAG.getExtractVectorElt(DL, ScalarVT, VectOp, LiveLane)}; + // preserve the order of operands + if (Swapped) + std::swap(Operands[0], Operands[1]); + LaneVal = DCI.DAG.getNode(N->getOpcode(), DL, ScalarVT, Operands); + } else { + LaneVal = DCI.DAG.getUNDEF(ScalarVT); + } + } + return DCI.DAG.getBuildVector(VectorVT, DL, Lanes); +} + /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. /// static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel) { - if (OptLevel == CodeGenOptLevel::None) - return SDValue(); - SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Skip non-integer, non-scalar case EVT VT = N0.getValueType(); - if (VT.isVector() || VT != MVT::i32) + if (VT.isVector()) + return PerformPackedOpCombine(N, DCI); + if (VT != MVT::i32) + return SDValue(); + + if (OptLevel == CodeGenOptLevel::None) return SDValue(); // First try with the default operand order. @@ -5102,7 +5191,10 @@ static SDValue PerformFADDCombine(SDNode *N, SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); - if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64)) + if (VT.isVector()) + return PerformPackedOpCombine(N, DCI); + + if (!(VT == MVT::f32 || VT == MVT::f64)) return SDValue(); // First try with the default operand order. 
@@ -5205,7 +5297,7 @@ static SDValue PerformANDCombine(SDNode *N, DCI.CombineTo(N, Val, AddTo); } - return SDValue(); + return PerformPackedOpCombine(N, DCI); } static SDValue PerformREMCombine(SDNode *N, @@ -5686,6 +5778,16 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformADDCombine(N, DCI, OptLevel); case ISD::FADD: return PerformFADDCombine(N, DCI, OptLevel); + case ISD::FMUL: + case ISD::FMINNUM: + case ISD::FMAXIMUM: + case ISD::UMIN: + case ISD::UMAX: + case ISD::SMIN: + case ISD::SMAX: + case ISD::OR: + case ISD::XOR: + return PerformPackedOpCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, OptLevel); case ISD::SHL: diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index d5b451dad7bc3..ca03550bdefcd 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -5,10 +5,10 @@ ; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_80 -mattr=+ptx70 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_80 %} -; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \ +; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx86 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | FileCheck -check-prefixes CHECK,CHECK-SM100 %s -; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx87 -O0 \ +; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_100 -mattr=+ptx86 -O0 \ ; RUN: -disable-post-ra -verify-machineinstrs \ ; RUN: | %ptxas-verify -arch=sm_100 %} target triple = "nvptx64-nvidia-cuda" @@ -43,45 +43,22 @@ define half @reduce_fadd_half(<8 x half> %in) { } define half @reduce_fadd_half_reassoc(<8 x half> %in) { -; CHECK-SM80-LABEL: reduce_fadd_half_reassoc( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<6>; -; CHECK-SM80-NEXT: .reg .b32 %r<10>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-SM80-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM80-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM80-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: add.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-SM80-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_fadd_half_reassoc( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<6>; -; CHECK-SM100-NEXT: .reg .b32 %r<10>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; -; CHECK-SM100-NEXT: add.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM100-NEXT: add.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM100-NEXT: add.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: add.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: mov.b16 %rs4, 0x0000; -; CHECK-SM100-NEXT: add.rn.f16 %rs5, %rs3, %rs4; -; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs5; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_fadd_half_reassoc( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<6>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; 
CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; +; CHECK-NEXT: add.rn.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: add.rn.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: add.rn.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: add.rn.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: mov.b16 %rs4, 0x0000; +; CHECK-NEXT: add.rn.f16 %rs5, %rs3, %rs4; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fadd(half 0.0, <8 x half> %in) ret half %res } @@ -205,41 +182,20 @@ define half @reduce_fmul_half(<8 x half> %in) { } define half @reduce_fmul_half_reassoc(<8 x half> %in) { -; CHECK-SM80-LABEL: reduce_fmul_half_reassoc( -; CHECK-SM80: { -; CHECK-SM80-NEXT: .reg .b16 %rs<4>; -; CHECK-SM80-NEXT: .reg .b32 %r<10>; -; CHECK-SM80-EMPTY: -; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; } -; CHECK-SM80-NEXT: // implicit-def: %rs2 -; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM80-NEXT: mul.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; } -; CHECK-SM80-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-SM80-NEXT: ret; -; -; CHECK-SM100-LABEL: reduce_fmul_half_reassoc( -; CHECK-SM100: { -; CHECK-SM100-NEXT: .reg .b16 %rs<4>; -; CHECK-SM100-NEXT: .reg .b32 %r<10>; -; CHECK-SM100-EMPTY: -; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r5, %r2, %r4; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r6, %r1, %r3; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r7, %r6, %r5; -; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7; -; CHECK-SM100-NEXT: // implicit-def: %rs2 -; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2}; -; CHECK-SM100-NEXT: mul.rn.f16x2 %r9, %r7, %r8; -; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9; -; CHECK-SM100-NEXT: st.param.b16 [func_retval0], %rs3; -; CHECK-SM100-NEXT: ret; +; CHECK-LABEL: reduce_fmul_half_reassoc( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; +; CHECK-NEXT: mul.rn.f16x2 %r5, %r2, %r4; +; CHECK-NEXT: mul.rn.f16x2 %r6, %r1, %r3; +; CHECK-NEXT: mul.rn.f16x2 %r7, %r6, %r5; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7; +; CHECK-NEXT: mul.rn.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; %res = call reassoc half @llvm.vector.reduce.fmul(half 1.0, <8 x half> %in) ret half %res } @@ -401,7 +357,6 @@ define half @reduce_fmax_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. 
define float @reduce_fmax_float(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmax_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -423,7 +378,6 @@ define float @reduce_fmax_float(<8 x float> %in) { } define float @reduce_fmax_float_reassoc(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmax_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -445,7 +399,6 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { } define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) { -; ; CHECK-LABEL: reduce_fmax_float_reassoc_nonpow2( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<14>; @@ -533,7 +486,6 @@ define half @reduce_fmin_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. define float @reduce_fmin_float(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmin_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -555,7 +507,6 @@ define float @reduce_fmin_float(<8 x float> %in) { } define float @reduce_fmin_float_reassoc(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmin_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -665,7 +616,6 @@ define half @reduce_fmaximum_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. define float @reduce_fmaximum_float(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmaximum_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -687,7 +637,6 @@ define float @reduce_fmaximum_float(<8 x float> %in) { } define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { -; ; CHECK-LABEL: reduce_fmaximum_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; @@ -709,7 +658,6 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { } define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) { -; ; CHECK-LABEL: reduce_fmaximum_float_reassoc_nonpow2( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<14>; @@ -797,7 +745,6 @@ define half @reduce_fminimum_half_reassoc_nonpow2(<7 x half> %in) { ; Check straight-line reduction. 
 define float @reduce_fminimum_float(<8 x float> %in) {
-;
 ; CHECK-LABEL: reduce_fminimum_float(
 ; CHECK: {
 ; CHECK-NEXT: .reg .b32 %r<16>;
@@ -819,7 +766,6 @@ define float @reduce_fminimum_float(<8 x float> %in) {
 }
 
 define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
-;
 ; CHECK-LABEL: reduce_fminimum_float_reassoc(
 ; CHECK: {
 ; CHECK-NEXT: .reg .b32 %r<16>;
@@ -841,7 +787,6 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
 }
 
 define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) {
-;
 ; CHECK-LABEL: reduce_fminimum_float_reassoc_nonpow2(
 ; CHECK: {
 ; CHECK-NEXT: .reg .b32 %r<14>;
@@ -888,20 +833,17 @@ define i16 @reduce_add_i16(<8 x i16> %in) {
 ; CHECK-SM100-LABEL: reduce_add_i16(
 ; CHECK-SM100: {
 ; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-NEXT: .reg .b32 %r<9>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT: // %bb.0:
 ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0];
 ; CHECK-SM100-NEXT: add.s16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT: add.s16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT: add.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: add.s16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-SM100-NEXT: add.s16 %rs3, %rs1, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
 ; CHECK-SM100-NEXT: ret;
 %res = call i16 @llvm.vector.reduce.add(<8 x i16> %in)
 ret i16 %res
@@ -1114,20 +1056,17 @@ define i16 @reduce_umax_i16(<8 x i16> %in) {
 ; CHECK-SM100-LABEL: reduce_umax_i16(
 ; CHECK-SM100: {
 ; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-NEXT: .reg .b32 %r<9>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT: // %bb.0:
 ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0];
 ; CHECK-SM100-NEXT: max.u16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT: max.u16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT: max.u16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: max.u16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-SM100-NEXT: max.u16 %rs3, %rs1, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
 ; CHECK-SM100-NEXT: ret;
 %res = call i16 @llvm.vector.reduce.umax(<8 x i16> %in)
 ret i16 %res
@@ -1248,20 +1187,17 @@ define i16 @reduce_umin_i16(<8 x i16> %in) {
 ; CHECK-SM100-LABEL: reduce_umin_i16(
 ; CHECK-SM100: {
 ; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-NEXT: .reg .b32 %r<9>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT: // %bb.0:
 ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0];
 ; CHECK-SM100-NEXT: min.u16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT: min.u16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT: min.u16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: min.u16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-SM100-NEXT: min.u16 %rs3, %rs1, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
 ; CHECK-SM100-NEXT: ret;
 %res = call i16 @llvm.vector.reduce.umin(<8 x i16> %in)
 ret i16 %res
@@ -1382,20 +1318,17 @@ define i16 @reduce_smax_i16(<8 x i16> %in) {
 ; CHECK-SM100-LABEL: reduce_smax_i16(
 ; CHECK-SM100: {
 ; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-NEXT: .reg .b32 %r<9>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT: // %bb.0:
 ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0];
 ; CHECK-SM100-NEXT: max.s16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT: max.s16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT: max.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: max.s16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-SM100-NEXT: max.s16 %rs3, %rs1, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
 ; CHECK-SM100-NEXT: ret;
 %res = call i16 @llvm.vector.reduce.smax(<8 x i16> %in)
 ret i16 %res
@@ -1516,20 +1449,17 @@ define i16 @reduce_smin_i16(<8 x i16> %in) {
 ; CHECK-SM100-LABEL: reduce_smin_i16(
 ; CHECK-SM100: {
 ; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
+; CHECK-SM100-NEXT: .reg .b32 %r<9>;
 ; CHECK-SM100-EMPTY:
 ; CHECK-SM100-NEXT: // %bb.0:
 ; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0];
 ; CHECK-SM100-NEXT: min.s16x2 %r5, %r2, %r4;
 ; CHECK-SM100-NEXT: min.s16x2 %r6, %r1, %r3;
 ; CHECK-SM100-NEXT: min.s16x2 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: min.s16x2 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
+; CHECK-SM100-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-SM100-NEXT: min.s16 %rs3, %rs1, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r8;
 ; CHECK-SM100-NEXT: ret;
 %res = call i16 @llvm.vector.reduce.smin(<8 x i16> %in)
 ret i16 %res
@@ -1625,43 +1555,21 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) {
 }
 
 define i16 @reduce_and_i16(<8 x i16> %in) {
-; CHECK-SM80-LABEL: reduce_and_i16(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM80-NEXT: .reg .b32 %r<11>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
-; CHECK-SM80-NEXT: and.b32 %r5, %r2, %r4;
-; CHECK-SM80-NEXT: and.b32 %r6, %r1, %r3;
-; CHECK-SM80-NEXT: and.b32 %r7, %r6, %r5;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
-; CHECK-SM80-NEXT: // implicit-def: %rs2
-; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM80-NEXT: and.b32 %r9, %r7, %r8;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; }
-; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_and_i16(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
-; CHECK-SM100-NEXT: and.b32 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: and.b32 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: and.b32 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: and.b32 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_and_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0];
+; CHECK-NEXT: and.b32 %r5, %r2, %r4;
+; CHECK-NEXT: and.b32 %r6, %r1, %r3;
+; CHECK-NEXT: and.b32 %r7, %r6, %r5;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ret;
 %res = call i16 @llvm.vector.reduce.and(<8 x i16> %in)
 ret i16 %res
 }
@@ -1736,43 +1644,21 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) {
 }
 
 define i16 @reduce_or_i16(<8 x i16> %in) {
-; CHECK-SM80-LABEL: reduce_or_i16(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM80-NEXT: .reg .b32 %r<11>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
-; CHECK-SM80-NEXT: or.b32 %r5, %r2, %r4;
-; CHECK-SM80-NEXT: or.b32 %r6, %r1, %r3;
-; CHECK-SM80-NEXT: or.b32 %r7, %r6, %r5;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
-; CHECK-SM80-NEXT: // implicit-def: %rs2
-; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM80-NEXT: or.b32 %r9, %r7, %r8;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; }
-; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_or_i16(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
-; CHECK-SM100-NEXT: or.b32 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: or.b32 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: or.b32 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: or.b32 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_or_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0];
+; CHECK-NEXT: or.b32 %r5, %r2, %r4;
+; CHECK-NEXT: or.b32 %r6, %r1, %r3;
+; CHECK-NEXT: or.b32 %r7, %r6, %r5;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: or.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ret;
 %res = call i16 @llvm.vector.reduce.or(<8 x i16> %in)
 ret i16 %res
 }
@@ -1847,43 +1733,21 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) {
 }
 
 define i16 @reduce_xor_i16(<8 x i16> %in) {
-; CHECK-SM80-LABEL: reduce_xor_i16(
-; CHECK-SM80: {
-; CHECK-SM80-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM80-NEXT: .reg .b32 %r<11>;
-; CHECK-SM80-EMPTY:
-; CHECK-SM80-NEXT: // %bb.0:
-; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
-; CHECK-SM80-NEXT: xor.b32 %r5, %r2, %r4;
-; CHECK-SM80-NEXT: xor.b32 %r6, %r1, %r3;
-; CHECK-SM80-NEXT: xor.b32 %r7, %r6, %r5;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
-; CHECK-SM80-NEXT: // implicit-def: %rs2
-; CHECK-SM80-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM80-NEXT: xor.b32 %r9, %r7, %r8;
-; CHECK-SM80-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r9; }
-; CHECK-SM80-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM80-NEXT: ret;
-;
-; CHECK-SM100-LABEL: reduce_xor_i16(
-; CHECK-SM100: {
-; CHECK-SM100-NEXT: .reg .b16 %rs<4>;
-; CHECK-SM100-NEXT: .reg .b32 %r<11>;
-; CHECK-SM100-EMPTY:
-; CHECK-SM100-NEXT: // %bb.0:
-; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
-; CHECK-SM100-NEXT: xor.b32 %r5, %r2, %r4;
-; CHECK-SM100-NEXT: xor.b32 %r6, %r1, %r3;
-; CHECK-SM100-NEXT: xor.b32 %r7, %r6, %r5;
-; CHECK-SM100-NEXT: mov.b32 {_, %rs1}, %r7;
-; CHECK-SM100-NEXT: // implicit-def: %rs2
-; CHECK-SM100-NEXT: mov.b32 %r8, {%rs1, %rs2};
-; CHECK-SM100-NEXT: xor.b32 %r9, %r7, %r8;
-; CHECK-SM100-NEXT: mov.b32 {%rs3, _}, %r9;
-; CHECK-SM100-NEXT: cvt.u32.u16 %r10, %rs3;
-; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r10;
-; CHECK-SM100-NEXT: ret;
+; CHECK-LABEL: reduce_xor_i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<4>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0];
+; CHECK-NEXT: xor.b32 %r5, %r2, %r4;
+; CHECK-NEXT: xor.b32 %r6, %r1, %r3;
+; CHECK-NEXT: xor.b32 %r7, %r6, %r5;
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r7;
+; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs3;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT: ret;
 %res = call i16 @llvm.vector.reduce.xor(<8 x i16> %in)
 ret i16 %res
 }