diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 5631342ecc13e..a39bf654ba5e8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -459,8 +459,17 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { // We only care about 16x2 as it's the only real vector type we // need to deal with. MVT VT = Vector.getSimpleValueType(); - if (!Isv2x16VT(VT)) + if (!isPackedVectorTy(VT) || VT.getVectorNumElements() != 2) return false; + + unsigned Opcode; + if (VT.is32BitVector()) + Opcode = NVPTX::I32toV2I16; + else if (VT.is64BitVector()) + Opcode = NVPTX::I64toV2I32; + else + llvm_unreachable("Unhandled packed type"); + // Find and record all uses of this vector that extract element 0 or 1. SmallVector E0, E1; for (auto *U : Vector.getNode()->users()) { @@ -484,11 +493,11 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { if (E0.empty() || E1.empty()) return false; - // Merge (f16 extractelt(V, 0), f16 extractelt(V,1)) - // into f16,f16 SplitF16x2(V) + // Merge (EltTy extractelt(V, 0), EltTy extractelt(V,1)) + // into EltTy,EltTy Split[EltTy]x2(V) MVT EltVT = VT.getVectorElementType(); SDNode *ScatterOp = - CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector); + CurDAG->getMachineNode(Opcode, SDLoc(N), EltVT, EltVT, Vector); for (auto *Node : E0) ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0)); for (auto *Node : E1) @@ -1004,6 +1013,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, std::optional Opcode_i8, case MVT::i32: case MVT::f32: return Opcode_i32; + case MVT::v2f32: case MVT::i64: case MVT::f64: return Opcode_i64; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d817a3c6a8777..0780b7a1fadd2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -195,11 +195,6 @@ static bool IsPTXVectorType(MVT 
VT) { } } -static bool Is16bitsType(MVT VT) { - return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 || - VT.SimpleTy == MVT::i16); -} - // When legalizing vector loads/stores, this function is called, which does two // things: // 1. Determines Whether the vector is something we want to custom lower, @@ -222,6 +217,9 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { const MVT EltVT = VectorVT.getVectorElementType(); const unsigned NumElts = VectorVT.getVectorNumElements(); + // The size of the PTX virtual register that holds a packed type. + unsigned PackRegSize; + // We only handle "native" vector sizes for now, e.g. <4 x double> is not // legal. We can (and should) split that into 2 stores of <2 x double> here // but I'm leaving that as a TODO for now. @@ -231,7 +229,6 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { case MVT::v4i64: case MVT::v4f64: case MVT::v8i32: - case MVT::v8f32: // This is a "native" vector type iff the address space is global // and the target supports 256-bit loads/stores if (!CanLowerTo256Bit) @@ -240,10 +237,8 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { case MVT::v2i8: case MVT::v2i32: case MVT::v2i64: - case MVT::v2f32: case MVT::v2f64: case MVT::v4i32: - case MVT::v4f32: // This is a "native" vector type return std::pair(NumElts, EltVT); case MVT::v16f16: // <8 x f16x2> @@ -267,22 +262,26 @@ getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { case MVT::v8bf16: // <4 x bf16x2> case MVT::v8i16: // <4 x i16x2> case MVT::v16i8: // <4 x i8x4> - // This can be upsized into a "native" vector type. - // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for - // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use - // vectorized loads/stores with the actual element type for i8/i16 as that - // would require v8/v16 variants that do not exist. 
- // In order to load/store such vectors efficiently, here in Type - // Legalization, we split the vector into word-sized chunks (v2x16/v4i8). - // Later, we will lower to PTX as vectors of b32. + PackRegSize = 32; + break; + case MVT::v8f32: // <4 x f32x2> + if (!CanLowerTo256Bit) + return std::nullopt; + LLVM_FALLTHROUGH; + case MVT::v2f32: // <1 x f32x2> + case MVT::v4f32: // <2 x f32x2> + PackRegSize = 64; + break; + } - // Number of elements to pack in one word. - const unsigned NPerWord = 32 / EltVT.getSizeInBits(); + // If we reach here, then we can pack 2 or more elements into a single 32-bit + // or 64-bit PTX register and treat the vector as a new vector containing + // packed elements. - return std::pair(NumElts / NPerWord, MVT::getVectorVT(EltVT, NPerWord)); - } + // Number of elements to pack in one word. + const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits(); - llvm_unreachable("All cases in switch should return."); + return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg)); } /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive @@ -330,11 +329,8 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, } ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset); - for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { - EVT VT = TempVTs[i]; - uint64_t Off = TempOffsets[i]; - // Split vectors into individual elements, except for v2f16, which - // we will pass as a single scalar. + for (auto [VT, Off] : zip(TempVTs, TempOffsets)) { + // Split vectors into individual elements, except for packed types if (VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); @@ -342,10 +338,21 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, // TargetLoweringBase::getVectorTypeBreakdown() which is invoked in // ComputePTXValueVTs() cannot currently break down non-power-of-2 sized // vectors. 
- if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0 && - isPowerOf2_32(NumElts)) { - // Vectors with an even number of f16 elements will be passed to - // us as an array of v2f16/v2bf16 elements. We must match this so we + + // Special case handling for packed i8s. + if (EltVT.getSimpleVT() == MVT::i8 && + ((NumElts % 4 == 0 && isPowerOf2_32(NumElts)) || NumElts == 3)) { + // v*i8 are formally lowered as v4i8 + EltVT = MVT::v4i8; + NumElts = (NumElts + 3) / 4; + } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) { + // v2i8 is promoted to v2i16 + NumElts = 1; + EltVT = MVT::v2i8; + } else if (isPackedElementTy(EltVT) && NumElts % 2 == 0 && + isPowerOf2_32(NumElts)) { + // Vectors with an even number of elements will be passed to + // us as an array of pairs of 2 elements. We must match this so we // stay in sync with Ins/Outs. switch (EltVT.getSimpleVT().SimpleTy) { case MVT::f16: @@ -357,20 +364,13 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, case MVT::i16: EltVT = MVT::v2i16; break; + case MVT::f32: + EltVT = MVT::v2f32; + break; default: llvm_unreachable("Unexpected type"); } NumElts /= 2; - } else if (EltVT.getSimpleVT() == MVT::i8 && - ((NumElts % 4 == 0 && isPowerOf2_32(NumElts)) || - NumElts == 3)) { - // v*i8 are formally lowered as v4i8 - EltVT = MVT::v4i8; - NumElts = (NumElts + 3) / 4; - } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) { - // v2i8 is promoted to v2i16 - NumElts = 1; - EltVT = MVT::v2i8; } for (unsigned j = 0; j != NumElts; ++j) { ValueVTs.push_back(EltVT); @@ -601,6 +601,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass); addRegisterClass(MVT::bf16, &NVPTX::B16RegClass); addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass); + addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass); // Conversion to/from FP16/FP16x2 is always legal. 
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); @@ -637,6 +638,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); + // No support for these operations with v2f32. + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand); + // Custom conversions to/from v2i8. setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); @@ -662,12 +667,16 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Operations not directly supported by NVPTX. for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, - MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8, - MVT::i32, MVT::i64}) { + MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, + MVT::v4i8, MVT::i32, MVT::i64}) { setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::BR_CC, VT, Expand); } + // Not directly supported. TLI would attempt to expand operations like + // FMINIMUM(v2f32) using invalid SETCC and VSELECT nodes. + setOperationAction(ISD::VSELECT, MVT::v2f32, Expand); + // Some SIGN_EXTEND_INREG can be done using cvt instruction. // For others we will expand to a SHL/SRA pair. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); @@ -864,6 +873,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setBF16OperationAction(Op, MVT::bf16, Legal, Promote); if (getOperationAction(Op, MVT::bf16) == Promote) AddPromotedToType(Op, MVT::bf16, MVT::f32); + setOperationAction(Op, MVT::v2f32, + STI.hasF32x2Instructions() ? 
Legal : Expand); } // On SM80, we select add/mul/sub as fma to avoid promotion to float @@ -885,6 +896,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand); setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand); + setOperationAction(ISD::FNEG, MVT::v2f32, Expand); // (would be) Library functions. // These map to conversion instructions for scalar FP types. @@ -895,6 +907,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(Op, MVT::f64, Legal); setOperationAction(Op, MVT::v2f16, Expand); setOperationAction(Op, MVT::v2bf16, Expand); + setOperationAction(Op, MVT::v2f32, Expand); setBF16OperationAction(Op, MVT::bf16, Legal, Promote); if (getOperationAction(Op, MVT::bf16) == Promote) AddPromotedToType(Op, MVT::bf16, MVT::f32); @@ -910,6 +923,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, } } + // Expand v2f32 = fp_extend + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + // Expand v2[b]f16 = fp_round v2f32 + setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand); + // sm_80 only has conversions between f32 and bf16. Custom lower all other // bf16 conversions. 
if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { @@ -947,14 +965,14 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(Op, MVT::f16, Promote); setOperationAction(Op, MVT::f32, Legal); setOperationAction(Op, MVT::f64, Legal); - setOperationAction(Op, MVT::v2f16, Expand); - setOperationAction(Op, MVT::v2bf16, Expand); + setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand); setOperationAction(Op, MVT::bf16, Promote); AddPromotedToType(Op, MVT::bf16, MVT::f32); } setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom); setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal); + setOperationAction(ISD::FABS, MVT::v2f32, Expand); if (STI.getPTXVersion() >= 65) { setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote); setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand); @@ -976,6 +994,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setBF16OperationAction(Op, MVT::bf16, Legal, Promote); if (getOperationAction(Op, MVT::bf16) == Promote) AddPromotedToType(Op, MVT::bf16, MVT::f32); + setOperationAction(Op, MVT::v2f32, Expand); } bool SupportsF32MinMaxNaN = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; @@ -985,6 +1004,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); setBF16OperationAction(Op, MVT::bf16, Legal, Expand); setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); + setOperationAction(Op, MVT::v2f32, Expand); } // Custom lowering for inline asm with 128-bit operands @@ -997,6 +1017,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // - bf16/bf16x2 (sm_90+, PTX 7.8+) // When f16/bf16 types aren't supported, they are promoted/expanded to f32. 
setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote); setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand); setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote); @@ -1008,7 +1029,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::FLOG2, MVT::f32, Legal); setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32); setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32); - setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16}, Expand); + setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, + Expand); } setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom); @@ -2069,7 +2091,7 @@ SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op->getValueType(0); - if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) + if (!isPackedVectorTy(VT) || !VT.is32BitVector()) return Op; SDLoc DL(Op); @@ -2119,15 +2141,12 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, Value = Value.trunc(8); return Value.zext(32); }; - APInt Value; - if (Isv2x16VT(VT)) { - Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16); - } else if (VT == MVT::v4i8) { - Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) | - GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24); - } else { - llvm_unreachable("Unsupported type"); - } + + APInt Value(32, 0); + const unsigned NumElements = VT.getVectorNumElements(); + const unsigned ShiftAmount = 32 / NumElements; + for (unsigned ElementNo : seq(NumElements)) + Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount); SDValue Const = DAG.getConstant(Value, DL, MVT::i32); return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const); } @@ -2155,7 +2174,8 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 
return Op; // Extract individual elements and select one of them. - assert(Isv2x16VT(VectorVT) && "Unexpected vector type."); + assert(isPackedVectorTy(VectorVT) && VectorVT.getVectorNumElements() == 2 && + "Unexpected vector type."); EVT EltVT = VectorVT.getVectorElementType(); SDLoc dl(Op.getNode()); @@ -3064,22 +3084,27 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV)); } +static void replaceLoadVector(SDNode *N, SelectionDAG &DAG, + SmallVectorImpl &Results, + const NVPTXSubtarget &STI); + SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::i1) return LowerLOADi1(Op, DAG); - // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle - // unaligned loads and have to handle it here. EVT VT = Op.getValueType(); - if (Isv2x16VT(VT) || VT == MVT::v4i8) { - LoadSDNode *Load = cast(Op); - EVT MemVT = Load->getMemoryVT(); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - MemVT, *Load->getMemOperand())) { - SDValue Ops[2]; - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); - return DAG.getMergeValues(Ops, SDLoc(Op)); - } + if (!isPackedVectorTy(VT)) + return SDValue(); + + // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to + // handle unaligned loads and have to handle it here. + LoadSDNode *Load = cast(Op); + EVT MemVT = Load->getMemoryVT(); + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, SDLoc(Op)); } return SDValue(); @@ -3115,17 +3140,19 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::i1) return LowerSTOREi1(Op, DAG); - // v2f16 is legal, so we can't rely on legalizer to handle unaligned - // stores and have to handle it here. 
- if ((Isv2x16VT(VT) || VT == MVT::v4i8) && + // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to + // handle unaligned stores and have to handle it here. + if (isPackedVectorTy(VT) && !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), VT, *Store->getMemOperand())) return expandUnalignedStore(Store, DAG); - // v2f16, v2bf16 and v2i16 don't need special handling. - if (Isv2x16VT(VT) || VT == MVT::v4i8) + // v2f16/v2bf16/v2i16 don't need special handling. + if (isPackedVectorTy(VT) && VT.is32BitVector()) return SDValue(); + // Lower store of any other vector type, including v2f32 as we want to break + // it apart since this is not a widely-supported type. return LowerSTOREVector(Op, DAG); } @@ -4923,7 +4950,7 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, return SDValue(); } -/// Fold extractelts into a load by increasing the number of return values. +/// Fold unpacking movs into a load by increasing the number of return values. /// /// ex: /// L: v2f16,ch = load

@@ -4932,6 +4959,7 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, /// use(a, b) /// /// ...is turned into... +/// /// L: f16,f16,ch = LoadV2

/// use(L:0, L:1) static SDValue @@ -4940,10 +4968,13 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { if (!DCI.isAfterLegalizeDAG()) return SDValue(); - EVT ElemVT = N->getValueType(0); - if (!Isv2x16VT(ElemVT)) + EVT ElementVT = N->getValueType(0); + // Avoid non-packed types and v4i8 + if (!isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8) return SDValue(); + SmallVector DeadCopyToRegs; + // Check whether all outputs are either used by an extractelt or are // glue/chain nodes if (!all_of(N->uses(), [&](SDUse &U) { @@ -4971,6 +5002,12 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return !U.getUser()->use_empty(); } + // Handle CopyToReg nodes that will become dead after our replacement + if (U.getUser()->getOpcode() == ISD::CopyToReg) { + DeadCopyToRegs.push_back(U.getUser()); + return true; + } + // Otherwise, this use prevents us from splitting a value. return false; })) @@ -5003,6 +5040,13 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { Opcode = NVPTXISD::LoadV4; break; case NVPTXISD::LoadV4: + // V8 is only supported for f32. Don't forget, we're not changing the load + // size here. This is already a 256-bit load. 
+ if (ElementVT != MVT::v2f32) + return SDValue(); + OldNumOutputs = 4; + Opcode = NVPTXISD::LoadV8; + break; case NVPTXISD::LoadV8: // PTX doesn't support the next doubling of outputs return SDValue(); @@ -5010,7 +5054,7 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // the non-glue, non-chain outputs in the new load const unsigned NewNumOutputs = OldNumOutputs * 2; - SmallVector NewVTs(NewNumOutputs, ElemVT.getVectorElementType()); + SmallVector NewVTs(NewNumOutputs, ElementVT.getVectorElementType()); // add remaining chain and glue values NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end()); @@ -5025,23 +5069,28 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SmallVector Results; for (unsigned I : seq(OldNumOutputs)) Results.push_back(DCI.DAG.getBuildVector( - ElemVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)})); + ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)})); // Add remaining chain and glue nodes for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs)) Results.push_back(NewLoad.getValue(NewNumOutputs + I)); + // Remove dead CopyToReg nodes by folding them into the chain they reference + for (SDNode *CTR : DeadCopyToRegs) + DCI.CombineTo(CTR, CTR->getOperand(0)); + return DCI.DAG.getMergeValues(Results, DL); } -/// Fold a packing mov into a store. +/// Fold packing movs into a store. /// /// ex: -/// v: v2f16 = BUILD_VECTOR a:f16, b:f16 -/// StoreRetval v +/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16 +/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16 +/// StoreV2 v1, v2 /// /// ...is turned into... /// -/// StoreRetvalV2 a:f16, b:f16 +/// StoreV4 a, b, c, d static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back) { @@ -5053,7 +5102,8 @@ static SDValue combinePackingMovIntoStore(SDNode *N, // Get the type of the operands being stored. 
EVT ElementVT = N->getOperand(Front).getValueType(); - if (!Isv2x16VT(ElementVT)) + // Avoid non-packed types and v4i8 + if (!isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8) return SDValue(); auto *ST = cast(N); @@ -5080,6 +5130,12 @@ static SDValue combinePackingMovIntoStore(SDNode *N, Opcode = NVPTXISD::StoreV4; break; case NVPTXISD::StoreV4: + // V8 is only supported for f32. Don't forget, we're not changing the store + // size here. This is already a 256-bit store. + if (ElementVT != MVT::v2f32) + return SDValue(); + Opcode = NVPTXISD::StoreV8; + break; case NVPTXISD::StoreParamV4: case NVPTXISD::StoreV8: // PTX doesn't support the next doubling of operands @@ -5609,10 +5665,10 @@ static SDValue PerformEXTRACTCombine(SDNode *N, IsPTXVectorType(VectorVT.getSimpleVT())) return SDValue(); // Native vector loads already combine nicely w/ // extract_vector_elt. - // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already - // handle them OK. - if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) || - VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8) + // Don't mess with singletons or packed types (v2f32, v2*16, v4i8 and v8i8), + // we already handle them OK. + if (VectorVT.getVectorNumElements() == 1 || isPackedVectorTy(VectorVT) || + VectorVT == MVT::v8i8) return SDValue(); // Don't mess with undef values as sra may be simplified to 0, not undef. 
@@ -5685,7 +5741,10 @@ static SDValue PerformVSELECTCombine(SDNode *N, static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { auto VT = N->getValueType(0); - if (!DCI.isAfterLegalizeDAG() || !Isv2x16VT(VT)) + if (!DCI.isAfterLegalizeDAG() || + // only process v2*16 types + !(isPackedVectorTy(VT) && VT.is32BitVector() && + VT.getVectorNumElements() == 2)) return SDValue(); auto Op0 = N->getOperand(0); @@ -5825,7 +5884,7 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, } /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. -static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, +static void replaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl &Results, const NVPTXSubtarget &STI) { LoadSDNode *LD = cast(N); @@ -6149,7 +6208,7 @@ void NVPTXTargetLowering::ReplaceNodeResults( ReplaceBITCAST(N, DAG, Results); return; case ISD::LOAD: - ReplaceLoadVector(N, DAG, Results, STI); + replaceLoadVector(N, DAG, Results, STI); return; case ISD::INTRINSIC_W_CHAIN: ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 1a2515b7f66f3..f7200c7bef11c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -151,6 +151,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; +def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">; def True : Predicate<"true">; @@ -220,6 +221,7 @@ def BF16RT : RegTyInfo; def F16X2RT : RegTyInfo; def BF16X2RT : RegTyInfo; +def F32X2RT : RegTyInfo; // This class provides a basic wrapper around an NVPTXInst that abstracts the @@ -451,6 +453,18 @@ multiclass F3 { [(set f16:$dst, (op_pat f16:$a, f16:$b))]>, 
Requires<[useFP16Math]>; + def f32x2rr_ftz : + BasicNVPTXInst<(outs B64:$dst), + (ins B64:$a, B64:$b), + op_str # ".ftz.f32x2", + [(set v2f32:$dst, (op_pat v2f32:$a, v2f32:$b))]>, + Requires<[hasF32x2Instructions, doF32FTZ]>; + def f32x2rr : + BasicNVPTXInst<(outs B64:$dst), + (ins B64:$a, B64:$b), + op_str # ".f32x2", + [(set v2f32:$dst, (op_pat v2f32:$a, v2f32:$b))]>, + Requires<[hasF32x2Instructions]>; def f16x2rr_ftz : BasicNVPTXInst<(outs B32:$dst), (ins B32:$a, B32:$b), @@ -829,6 +843,9 @@ def : Pat<(vt (select i1:$p, vt:$a, vt:$b)), (SELP_b32rr $a, $b, $p)>; } +def : Pat<(v2f32 (select i1:$p, v2f32:$a, v2f32:$b)), + (SELP_b64rr $a, $b, $p)>; + //----------------------------------- // Test Instructions //----------------------------------- @@ -1345,6 +1362,8 @@ defm BFMA16 : FMA<"fma.rn.bf16", BF16RT, [hasBF16Math]>; defm BFMA16x2 : FMA<"fma.rn.bf16x2", BF16X2RT, [hasBF16Math]>; defm FMA32_ftz : FMA<"fma.rn.ftz.f32", F32RT, [doF32FTZ]>; defm FMA32 : FMA<"fma.rn.f32", F32RT>; +defm FMA32x2_ftz : FMA<"fma.rn.ftz.f32x2", F32X2RT, [hasF32x2Instructions, doF32FTZ]>; +defm FMA32x2 : FMA<"fma.rn.f32x2", F32X2RT, [hasF32x2Instructions]>; defm FMA64 : FMA<"fma.rn.f64", F64RT>; // sin/cos @@ -2585,6 +2604,7 @@ def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H $s)>; def: Pat<(i32 (sext (extractelt v2i16:$src, 0))), (CVT_INREG_s32_s16 $src)>; +// Handle extracting one element from the pair (32-bit types) foreach vt = [v2f16, v2bf16, v2i16] in { def : Pat<(extractelt vt:$src, 0), (I32toI16L_Sink $src)>, Requires<[hasPTX<71>]>; def : Pat<(extractelt vt:$src, 1), (I32toI16H_Sink $src)>, Requires<[hasPTX<71>]>; @@ -2596,10 +2616,21 @@ foreach vt = [v2f16, v2bf16, v2i16] in { (V2I16toI32 $a, $b)>; } +// Same thing for the 64-bit type v2f32. 
+foreach vt = [v2f32] in { + def : Pat<(extractelt vt:$src, 0), (I64toI32L_Sink $src)>, Requires<[hasPTX<71>]>; + def : Pat<(extractelt vt:$src, 1), (I64toI32H_Sink $src)>, Requires<[hasPTX<71>]>; + + def : Pat<(extractelt vt:$src, 0), (I64toI32L $src)>; + def : Pat<(extractelt vt:$src, 1), (I64toI32H $src)>; + + def : Pat<(vt (build_vector vt.ElementType:$a, vt.ElementType:$b)), + (V2I32toI64 $a, $b)>; +} + def: Pat<(v2i16 (scalar_to_vector i16:$a)), (CVT_u32_u16 $a, CvtNONE)>; - def nvptx_build_vector : SDNode<"NVPTXISD::BUILD_VECTOR", SDTypeProfile<1, 2, []>, []>; def : Pat<(i64 (nvptx_build_vector i32:$a, i32:$b)), diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td index 9fac97d97c609..f539f9030c481 100644 --- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -60,7 +60,8 @@ def B16 : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4))>; def B32 : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8, f32], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>; -def B64 : NVPTXRegClass<[i64, f64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>; +def B64 : NVPTXRegClass<[i64, v2f32, f64], 64, (add (sequence "RL%u", 0, 4), + VRFrame64, VRFrameLocal64)>; // 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only. 
def B128 : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 8810feaee297a..81af55edccadb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -116,6 +116,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { return HasTcgen05 && PTXVersion >= 86; } + // f32x2 instructions in Blackwell family + bool hasF32x2Instructions() const { + return SmVersion >= 100 && PTXVersion >= 86; + } // TMA G2S copy with cta_group::1/2 support bool hasCpAsyncBulkTensorCTAGroupSupport() const { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index aa7850acbd64a..b901138a87170 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -129,8 +129,9 @@ class NVPTXTTIImpl final : public BasicTTIImplBase { Insert = false; } } - if (Insert && Isv2x16VT(VT)) { - // Can be built in a single mov + if (Insert && isPackedVectorTy(VT) && VT.is32BitVector()) { + // Can be built in a single 32-bit mov (64-bit regs are emulated in SASS + // with 2x 32-bit regs) Cost += 1; Insert = false; } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index e792e441e49e6..103e67061b806 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -85,8 +85,14 @@ inline unsigned promoteScalarArgumentSize(unsigned size) { bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM); -inline bool Isv2x16VT(EVT VT) { - return (VT == MVT::v2f16 || VT == MVT::v2bf16 || VT == MVT::v2i16); +inline bool isPackedVectorTy(EVT VT) { + return (VT == MVT::v4i8 || VT == MVT::v2f16 || VT == MVT::v2bf16 || + VT == MVT::v2i16 || VT == MVT::v2f32); +} + +inline bool isPackedElementTy(EVT VT) { + return (VT == MVT::i8 || VT == MVT::f16 || VT == MVT::bf16 || + VT == 
MVT::i16 || VT == MVT::f32); } inline bool shouldPassAsArray(Type *Ty) { diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll index 1c8f019922e37..bb55545f43718 100644 --- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll +++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %} @@ -7,57 +8,105 @@ declare [2 x float] @bara([2 x float] %input) declare {float, float} @bars({float, float} %input) define void @test_v2f32(<2 x float> %input, ptr %output) { -; CHECK-LABEL: @test_v2f32 +; CHECK-LABEL: test_v2f32( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0]; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), barv, (param0); +; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: ld.param.b64 %rd4, [test_v2f32_param_1]; +; CHECK-NEXT: st.b64 [%rd4], %rd2; +; CHECK-NEXT: ret; %call = tail call <2 x float> @barv(<2 x float> %input) -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [retval0]; store <2 x float> %call, ptr %output, align 8 -; CHECK: st.v2.b32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]} ret void } define void @test_v3f32(<3 x float> %input, ptr %output) { -; CHECK-LABEL: @test_v3f32 -; +; CHECK-LABEL: test_v3f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r3, [test_v3f32_param_0+8]; +; 
CHECK-NEXT: { // callseq 1, 0 +; CHECK-NEXT: .param .align 16 .b8 param0[16]; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; +; CHECK-NEXT: st.param.b32 [param0+8], %r3; +; CHECK-NEXT: .param .align 16 .b8 retval0[16]; +; CHECK-NEXT: call.uni (retval0), barv3, (param0); +; CHECK-NEXT: ld.param.v2.b32 {%r4, %r5}, [retval0]; +; CHECK-NEXT: ld.param.b32 %r6, [retval0+8]; +; CHECK-NEXT: } // callseq 1 +; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_1]; +; CHECK-NEXT: st.b32 [%rd1+8], %r6; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r4, %r5}; +; CHECK-NEXT: ret; %call = tail call <3 x float> @barv3(<3 x float> %input) -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [retval0]; -; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [retval0+8]; ; Make sure we don't load more values than than we need to. -; CHECK-NOT: ld.param.b32 [[E3:%r[0-9]+]], [retval0+12]; store <3 x float> %call, ptr %output, align 8 -; CHECK-DAG: st.b32 [{{%rd[0-9]}}+8], -; -- This is suboptimal. We should do st.v2.f32 instead -; of combining 2xf32 info i64. 
-; CHECK-DAG: st.b64 [{{%rd[0-9]}}], -; CHECK: ret; ret void } define void @test_a2f32([2 x float] %input, ptr %output) { -; CHECK-LABEL: @test_a2f32 +; CHECK-LABEL: test_a2f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_a2f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_a2f32_param_0+4]; +; CHECK-NEXT: { // callseq 2, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[8]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: st.param.b32 [param0+4], %r2; +; CHECK-NEXT: .param .align 4 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), bara, (param0); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; +; CHECK-NEXT: ld.param.b32 %r4, [retval0+4]; +; CHECK-NEXT: } // callseq 2 +; CHECK-NEXT: ld.param.b64 %rd1, [test_a2f32_param_1]; +; CHECK-NEXT: st.b32 [%rd1+4], %r4; +; CHECK-NEXT: st.b32 [%rd1], %r3; +; CHECK-NEXT: ret; %call = tail call [2 x float] @bara([2 x float] %input) -; CHECK: .param .align 4 .b8 retval0[8]; -; CHECK-DAG: ld.param.b32 [[ELEMA1:%r[0-9]+]], [retval0]; -; CHECK-DAG: ld.param.b32 [[ELEMA2:%r[0-9]+]], [retval0+4]; store [2 x float] %call, ptr %output, align 4 -; CHECK: } -; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMA1]] -; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMA2]] ret void -; CHECK: ret } define void @test_s2f32({float, float} %input, ptr %output) { -; CHECK-LABEL: @test_s2f32 +; CHECK-LABEL: test_s2f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_s2f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_s2f32_param_0+4]; +; CHECK-NEXT: { // callseq 3, 0 +; CHECK-NEXT: .param .align 4 .b8 param0[8]; +; CHECK-NEXT: st.param.b32 [param0], %r1; +; CHECK-NEXT: st.param.b32 [param0+4], %r2; +; CHECK-NEXT: .param .align 4 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), bars, (param0); +; CHECK-NEXT: ld.param.b32 %r3, [retval0]; 
+; CHECK-NEXT: ld.param.b32 %r4, [retval0+4]; +; CHECK-NEXT: } // callseq 3 +; CHECK-NEXT: ld.param.b64 %rd1, [test_s2f32_param_1]; +; CHECK-NEXT: st.b32 [%rd1+4], %r4; +; CHECK-NEXT: st.b32 [%rd1], %r3; +; CHECK-NEXT: ret; %call = tail call {float, float} @bars({float, float} %input) -; CHECK: .param .align 4 .b8 retval0[8]; -; CHECK-DAG: ld.param.b32 [[ELEMS1:%r[0-9]+]], [retval0]; -; CHECK-DAG: ld.param.b32 [[ELEMS2:%r[0-9]+]], [retval0+4]; store {float, float} %call, ptr %output, align 4 -; CHECK: } -; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMS1]] -; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMS2]] ret void -; CHECK: ret } diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index f59f51c3c57d3..28ebfb7fd6ff8 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -688,25 +688,25 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM70-NEXT: cvt.u32.u16 %r5, %rs8; +; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM70-NEXT: cvt.u32.u16 %r5, %rs2; ; SM70-NEXT: shl.b32 %r6, %r5, 16; -; SM70-NEXT: cvt.u32.u16 %r7, %rs7; +; SM70-NEXT: cvt.u32.u16 %r7, %rs1; ; SM70-NEXT: shl.b32 %r8, %r7, 16; -; SM70-NEXT: cvt.u32.u16 %r9, %rs6; +; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM70-NEXT: cvt.u32.u16 %r9, %rs4; ; SM70-NEXT: shl.b32 %r10, %r9, 16; -; SM70-NEXT: cvt.u32.u16 %r11, %rs5; +; SM70-NEXT: cvt.u32.u16 %r11, %rs3; ; SM70-NEXT: shl.b32 %r12, %r11, 16; -; SM70-NEXT: cvt.u32.u16 %r13, %rs4; +; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; SM70-NEXT: cvt.u32.u16 %r13, %rs6; ; SM70-NEXT: shl.b32 %r14, %r13, 16; -; SM70-NEXT: cvt.u32.u16 %r15, 
%rs3; +; SM70-NEXT: cvt.u32.u16 %r15, %rs5; ; SM70-NEXT: shl.b32 %r16, %r15, 16; -; SM70-NEXT: cvt.u32.u16 %r17, %rs2; +; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; SM70-NEXT: cvt.u32.u16 %r17, %rs8; ; SM70-NEXT: shl.b32 %r18, %r17, 16; -; SM70-NEXT: cvt.u32.u16 %r19, %rs1; +; SM70-NEXT: cvt.u32.u16 %r19, %rs7; ; SM70-NEXT: shl.b32 %r20, %r19, 16; ; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%r20, %r18, %r16, %r14}; ; SM70-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r10, %r8, %r6}; @@ -721,18 +721,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM80-NEXT: cvt.f32.bf16 %r5, %rs8; -; SM80-NEXT: cvt.f32.bf16 %r6, %rs7; -; SM80-NEXT: cvt.f32.bf16 %r7, %rs6; -; SM80-NEXT: cvt.f32.bf16 %r8, %rs5; -; SM80-NEXT: cvt.f32.bf16 %r9, %rs4; -; SM80-NEXT: cvt.f32.bf16 %r10, %rs3; -; SM80-NEXT: cvt.f32.bf16 %r11, %rs2; -; SM80-NEXT: cvt.f32.bf16 %r12, %rs1; +; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM80-NEXT: cvt.f32.bf16 %r5, %rs2; +; SM80-NEXT: cvt.f32.bf16 %r6, %rs1; +; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM80-NEXT: cvt.f32.bf16 %r7, %rs4; +; SM80-NEXT: cvt.f32.bf16 %r8, %rs3; +; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; SM80-NEXT: cvt.f32.bf16 %r9, %rs6; +; SM80-NEXT: cvt.f32.bf16 %r10, %rs5; +; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; SM80-NEXT: cvt.f32.bf16 %r11, %rs8; +; SM80-NEXT: cvt.f32.bf16 %r12, %rs7; ; SM80-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-NEXT: ret; @@ -746,18 +746,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b64 %rd1, 
[test_extload_bf16x8_param_0]; ; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs1; +; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs4; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs3; +; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs6; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs5; +; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs8; +; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs7; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-FTZ-NEXT: ret; @@ -771,18 +771,18 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM90-NEXT: cvt.f32.bf16 %r5, %rs8; -; SM90-NEXT: cvt.f32.bf16 %r6, %rs7; -; SM90-NEXT: cvt.f32.bf16 %r7, %rs6; -; SM90-NEXT: cvt.f32.bf16 %r8, %rs5; -; SM90-NEXT: cvt.f32.bf16 %r9, %rs4; -; SM90-NEXT: cvt.f32.bf16 %r10, 
%rs3; -; SM90-NEXT: cvt.f32.bf16 %r11, %rs2; -; SM90-NEXT: cvt.f32.bf16 %r12, %rs1; +; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; SM90-NEXT: cvt.f32.bf16 %r5, %rs2; +; SM90-NEXT: cvt.f32.bf16 %r6, %rs1; +; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; SM90-NEXT: cvt.f32.bf16 %r7, %rs4; +; SM90-NEXT: cvt.f32.bf16 %r8, %rs3; +; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; SM90-NEXT: cvt.f32.bf16 %r9, %rs6; +; SM90-NEXT: cvt.f32.bf16 %r10, %rs5; +; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; SM90-NEXT: cvt.f32.bf16 %r11, %rs8; +; SM90-NEXT: cvt.f32.bf16 %r12, %rs7; ; SM90-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM90-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM90-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index ba5813c869236..e2a914d8cfc36 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -359,12 +359,11 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b, define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK-LABEL: test_fptrunc_2xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0]; -; CHECK-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %rd1; ; CHECK-NEXT: ret; %r = fptrunc <2 x float> %a to <2 x bfloat> ret <2 x bfloat> %r diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 43a605f2b34d7..0e98dc0007e35 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -45,12 +45,11 @@ define <2 x half> @test_ret_const() #0 { define half 
@test_extract_0(<2 x half> %a) #0 { ; CHECK-LABEL: test_extract_0( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; -; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_0_param_0]; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; %e = extractelement <2 x half> %a, i32 0 @@ -60,13 +59,12 @@ define half @test_extract_0(<2 x half> %a) #0 { define half @test_extract_1(<2 x half> %a) #0 { ; CHECK-LABEL: test_extract_1( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; -; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; } -; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_1_param_0]; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; ; CHECK-NEXT: ret; %e = extractelement <2 x half> %a, i32 1 ret half %e @@ -82,9 +80,8 @@ define half @test_extract_i(<2 x half> %a, i64 %idx) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_i_param_0]; ; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; ; CHECK-NEXT: ret; @@ -110,16 +107,14 @@ define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fadd_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, 
%rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fadd_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: add.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NOF16-NEXT: add.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -148,8 +143,7 @@ define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_imm_0_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_0_param_0]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -181,8 +175,7 @@ define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_imm_1_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_1_param_0]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -214,16 +207,14 @@ define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, 
[test_fsub_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fsub_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsub_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsub_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: sub.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NOF16-NEXT: sub.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -251,8 +242,7 @@ define <2 x half> @test_fneg(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fneg_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fneg_param_0]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: mov.b32 %r3, 0f00000000; ; CHECK-NOF16-NEXT: sub.rn.f32 %r4, %r3, %r2; @@ -285,16 +275,14 @@ define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmul_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmul_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmul_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmul_param_1]; 
+; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: mul.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NOF16-NEXT: mul.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -311,16 +299,14 @@ define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_fdiv_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_fdiv_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; +; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NEXT: div.rn.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NEXT: div.rn.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -345,12 +331,10 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_frem_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_frem_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_frem_param_0]; +; CHECK-NEXT: 
ld.param.v2.b16 {%rs3, %rs4}, [test_frem_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NEXT: div.rn.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; ; CHECK-NEXT: neg.f32 %r7, %r6; @@ -358,8 +342,8 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: testp.infinite.f32 %p1, %r3; ; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r9; -; CHECK-NEXT: cvt.f32.f16 %r10, %rs1; -; CHECK-NEXT: cvt.f32.f16 %r11, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r11, %rs1; ; CHECK-NEXT: div.rn.f32 %r12, %r11, %r10; ; CHECK-NEXT: cvt.rzi.f32.f32 %r13, %r12; ; CHECK-NEXT: neg.f32 %r14, %r13; @@ -551,13 +535,11 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; CHECK-F16-NEXT: // %bb.0: ; CHECK-F16-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; ; CHECK-F16-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; -; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; -; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; CHECK-F16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; ; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r3, %r4; -; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-F16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-F16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; -; CHECK-F16-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; +; CHECK-F16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_1]; +; CHECK-F16-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; +; CHECK-F16-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; ; CHECK-F16-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; CHECK-F16-NEXT: ret; ; @@ -568,22 +550,18 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; CHECK-NOF16-NEXT: .reg .b32 %r<9>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; -; CHECK-NOF16-NEXT: ld.param.b32 %r3, 
[test_select_cc_param_2]; -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_3]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_select_cc_param_2]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs5; ; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r6, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs6; ; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r8, %r7; -; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r2; -; CHECK-NOF16-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p2; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs7, %rs5, %p1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1]; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs2, %rs8, %p2; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs1, %rs7, %p1; ; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs9}; ; CHECK-NOF16-NEXT: ret; %cc = fcmp une <2 x half> %c, %d @@ -596,15 +574,16 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .pred %p<3>; ; CHECK-F16-NEXT: .reg .b32 %r<9>; +; CHECK-F16-NEXT: .reg .b64 %rd<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1]; -; CHECK-F16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0]; -; CHECK-F16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3]; -; CHECK-F16-NEXT: ld.param.b32 %r5, 
[test_select_cc_f32_f16_param_2]; -; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r5, %r6; -; CHECK-F16-NEXT: selp.f32 %r7, %r2, %r4, %p2; -; CHECK-F16-NEXT: selp.f32 %r8, %r1, %r3, %p1; +; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0]; +; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f32_f16_param_1]; +; CHECK-F16-NEXT: selp.f32 %r7, %r4, %r6, %p2; +; CHECK-F16-NEXT: selp.f32 %r8, %r3, %r5, %p1; ; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-F16-NEXT: ret; ; @@ -613,22 +592,21 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-NOF16-NEXT: .reg .pred %p<3>; ; CHECK-NOF16-NEXT: .reg .b16 %rs<5>; ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; +; CHECK-NOF16-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_1]; -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f16_param_0]; -; CHECK-NOF16-NEXT: ld.param.b32 %r6, [test_select_cc_f32_f16_param_3]; -; CHECK-NOF16-NEXT: ld.param.b32 %r5, [test_select_cc_f32_f16_param_2]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r8, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs4; -; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r10, %r9; -; CHECK-NOF16-NEXT: selp.f32 %r11, %r2, %r4, %p2; -; CHECK-NOF16-NEXT: selp.f32 %r12, %r1, %r3, %p1; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f32_f16_param_3]; +; CHECK-NOF16-NEXT: 
cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f32_f16_param_2]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r6, %r5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs4; +; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r8, %r7; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r9, %r10}, [test_select_cc_f32_f16_param_1]; +; CHECK-NOF16-NEXT: selp.f32 %r11, %r4, %r10, %p2; +; CHECK-NOF16-NEXT: selp.f32 %r12, %r3, %r9, %p1; ; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11}; ; CHECK-NOF16-NEXT: ret; <2 x half> %c, <2 x half> %d) #0 { @@ -643,18 +621,17 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, ; CHECK-NEXT: .reg .pred %p<3>; ; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3]; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f16_f32_param_0]; ; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2]; -; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3]; ; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5; ; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; -; CHECK-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; +; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f16_f32_param_1]; +; CHECK-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; +; CHECK-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; CHECK-NEXT: ret; <2 x float> %c, <2 x float> %d) #0 { @@ -687,15 +664,13 @@ define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { ; 
CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_une_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_une_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_une_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_une_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -730,15 +705,13 @@ define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ueq_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ueq_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ueq_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ueq_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.equ.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.equ.f32 
%p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -773,15 +746,13 @@ define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ugt_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ugt_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ugt_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ugt_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.gtu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.gtu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -816,15 +787,13 @@ define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_uge_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_uge_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uge_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uge_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.geu.f32 %p1, %r4, %r3; -; 
CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.geu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -859,15 +828,13 @@ define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ult_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ult_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ult_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ult_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.ltu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.ltu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -902,15 +869,13 @@ define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ule_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ule_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ule_param_0]; +; 
CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ule_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.leu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.leu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -946,15 +911,13 @@ define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_uno_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_uno_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uno_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uno_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -989,15 +952,13 @@ define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_one_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_one_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; 
CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_one_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_one_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.ne.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.ne.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1032,15 +993,13 @@ define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_oeq_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_oeq_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oeq_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_oeq_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.eq.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1075,15 +1034,13 @@ define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; 
CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ogt_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ogt_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ogt_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ogt_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.gt.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.gt.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1118,15 +1075,13 @@ define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_oge_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_oge_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oge_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_oge_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.ge.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.ge.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; 
CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1161,15 +1116,13 @@ define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_olt_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_olt_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_olt_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_olt_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.lt.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1204,15 +1157,13 @@ define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ole_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ole_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ole_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ole_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.le.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: 
cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.le.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1247,15 +1198,13 @@ define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ord_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ord_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ord_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ord_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NOF16-NEXT: setp.num.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; ; CHECK-NOF16-NEXT: setp.num.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1273,8 +1222,7 @@ define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_fptosi_i32_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_i32_param_0]; ; CHECK-NEXT: cvt.rzi.s32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.rzi.s32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -1291,8 +1239,7 @@ define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 
%r1, [test_fptosi_i64_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_i64_param_0]; ; CHECK-NEXT: cvt.rzi.s64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.rzi.s64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1308,8 +1255,7 @@ define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_fptoui_2xi32_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xi32_param_0]; ; CHECK-NEXT: cvt.rzi.u32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.rzi.u32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -1326,8 +1272,7 @@ define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_fptoui_2xi64_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xi64_param_0]; ; CHECK-NEXT: cvt.rzi.u64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.rzi.u64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1424,17 +1369,16 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; -; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1]; ; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs1, %r1; ; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs2, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; -; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r5, %r4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2; +; 
CHECK-NOF16-NEXT: add.rn.f32 %r6, %r4, %r5; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r8, %r7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; +; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r7, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r9; ; CHECK-NOF16-NEXT: mov.b32 %r10, {%rs6, %rs5}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r10; @@ -1467,17 +1411,16 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0]; -; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1]; ; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs1, %r1; ; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs2, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; -; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r5, %r4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_sitofp_2xi32_fadd_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2; +; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r4, %r5; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r8, %r7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; +; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r7, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r9; ; CHECK-NOF16-NEXT: mov.b32 %r10, {%rs6, %rs5}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r10; @@ -1490,15 +1433,11 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK-LABEL: test_fptrunc_2xfloat( ; CHECK: { -; CHECK-NEXT: 
.reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0]; -; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r2; -; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r1; -; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1}; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %rd1; ; CHECK-NEXT: ret; %r = fptrunc <2 x float> %a to <2 x half> ret <2 x half> %r @@ -1529,8 +1468,7 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xfloat_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fpext_2xfloat_param_0]; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.f32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -1547,8 +1485,7 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xdouble_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fpext_2xdouble_param_0]; ; CHECK-NEXT: cvt.f64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.f64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1641,8 +1578,7 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_sqrt_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sqrt_param_0]; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: sqrt.rn.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1670,8 +1606,7 @@ define <2 x half> @test_sin(<2 x half> %a) #0 #1 { ; 
CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_sin_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sin_param_0]; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: sin.approx.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1692,8 +1627,7 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_cos_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_cos_param_0]; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: cos.approx.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1769,20 +1703,17 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fma_param_2]; -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fma_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fma_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; -; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fma_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fma_param_2]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_fma_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs2; ; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5; +; 
CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs1; ; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11; ; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7}; @@ -1809,8 +1740,7 @@ define <2 x half> @test_fabs(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fabs_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fabs_param_0]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: abs.f32 %r3, %r2; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1831,16 +1761,14 @@ define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_minnum_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_minnum_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_minnum_param_0]; +; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_minnum_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NEXT: min.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NEXT: min.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -1857,16 +1785,14 @@ define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_maxnum_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, 
[test_maxnum_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_maxnum_param_0]; +; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_maxnum_param_1]; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; ; CHECK-NEXT: max.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; ; CHECK-NEXT: max.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -1896,15 +1822,13 @@ define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<3>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, -32768; -; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs5, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs1, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs9, %rs4, 32767; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_copysign_param_1]; +; CHECK-NOF16-NEXT: and.b16 %rs5, %rs4, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs2, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs5; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs3, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs9, %rs1, 32767; ; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; ; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs7}; ; CHECK-NOF16-NEXT: ret; @@ -1917,10 +1841,11 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x 
float> %b) #0 { ; CHECK-F16: { ; CHECK-F16-NEXT: .reg .b16 %rs<3>; ; CHECK-F16-NEXT: .reg .b32 %r<8>; +; CHECK-F16-NEXT: .reg .b64 %rd<2>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %r3; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %r2; ; CHECK-F16-NEXT: mov.b32 %r4, {%rs2, %rs1}; @@ -1934,19 +1859,19 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-NOF16: { ; CHECK-NOF16-NEXT: .reg .b16 %rs<9>; ; CHECK-NOF16-NEXT: .reg .b32 %r<6>; +; CHECK-NOF16-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_f32_param_0]; ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; +; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; ; CHECK-NOF16-NEXT: and.b32 %r4, %r3, -2147483648; -; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r4; } -; CHECK-NOF16-NEXT: mov.b32 {%rs2, %rs3}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs4, %rs3, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs5, %rs4, %rs1; +; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r4; } +; CHECK-NOF16-NEXT: or.b16 %rs5, %rs3, %rs4; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767; ; CHECK-NOF16-NEXT: and.b32 %r5, %r2, -2147483648; -; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs6}, %r5; } -; CHECK-NOF16-NEXT: and.b16 %rs7, %rs2, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs8, %rs7, %rs6; +; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r5; } +; CHECK-NOF16-NEXT: or.b16 %rs8, %rs6, %rs7; ; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs8, %rs5}; ; CHECK-NOF16-NEXT: ret; %tb = fptrunc <2 x float> %b to <2 x half> @@ -1981,8 +1906,7 @@ define <2 x 
half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_f64_param_0]; ; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; ; CHECK-NOF16-NEXT: and.b64 %rd3, %rd2, -9223372036854775808; ; CHECK-NOF16-NEXT: shr.u64 %rd4, %rd3, 48; @@ -2024,15 +1948,13 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<5>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_extended_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NOF16-NEXT: and.b16 %rs3, %rs1, -32768; -; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs4, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs2, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs9, %rs5, 32767; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_extended_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_copysign_extended_param_1]; +; CHECK-NOF16-NEXT: and.b16 %rs5, %rs3, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs5; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs4, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs9, %rs2, 32767; ; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs10; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs7; @@ -2050,8 +1972,7 @@ define <2 x half> @test_floor(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_floor_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; 
CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_floor_param_0]; ; CHECK-NEXT: cvt.rmi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rmi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2067,8 +1988,7 @@ define <2 x half> @test_ceil(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_ceil_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_ceil_param_0]; ; CHECK-NEXT: cvt.rpi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rpi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2084,8 +2004,7 @@ define <2 x half> @test_trunc(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_trunc_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_trunc_param_0]; ; CHECK-NEXT: cvt.rzi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2101,8 +2020,7 @@ define <2 x half> @test_rint(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_rint_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_rint_param_0]; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2118,8 +2036,7 @@ define <2 x half> @test_nearbyint(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_nearbyint_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_nearbyint_param_0]; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], 
{%rs4, %rs3}; @@ -2135,8 +2052,7 @@ define <2 x half> @test_roundeven(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_roundeven_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_roundeven_param_0]; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2154,8 +2070,7 @@ define <2 x half> @test_round(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<21>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_round_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_round_param_0]; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; ; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; @@ -2206,20 +2121,17 @@ define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fmuladd_param_2]; -; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmuladd_param_1]; -; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmuladd_param_0]; -; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; -; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; -; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmuladd_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmuladd_param_2]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; +; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_fmuladd_param_1]; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs2; ; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4; ; CHECK-NOF16-NEXT: 
cvt.rn.f16.f32 %rs7, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs1; ; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11; ; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7}; @@ -2236,8 +2148,7 @@ define <2 x half> @test_shufflevector(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_shufflevector_param_0]; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; ; CHECK-NEXT: ret; %s = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> @@ -2247,13 +2158,12 @@ define <2 x half> @test_shufflevector(<2 x half> %a) #0 { define <2 x half> @test_insertelement(<2 x half> %a, half %x) #0 { ; CHECK-LABEL: test_insertelement( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; -; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; } +; CHECK-NEXT: ld.param.v2.b16 {%rs2, %rs3}, [test_insertelement_param_0]; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; ; CHECK-NEXT: ret; %i = insertelement <2 x half> %a, half %x, i64 1 @@ -2267,8 +2177,7 @@ define <2 x half> @test_sitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sitofp_2xi16_to_2xhalf_param_0]; ; 
CHECK-NEXT: cvt.rn.f16.s16 %rs3, %rs2; ; CHECK-NEXT: cvt.rn.f16.s16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2284,8 +2193,7 @@ define <2 x half> @test_uitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_uitofp_2xi16_to_2xhalf_param_0]; ; CHECK-NEXT: cvt.rn.f16.u16 %rs3, %rs2; ; CHECK-NEXT: cvt.rn.f16.u16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll new file mode 100644 index 0000000000000..0f72980f480b1 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -0,0 +1,1962 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; ## Full FP32x2 support enabled by default. 
+; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-NOF32X2 %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_80 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_80 \ +; RUN: %} +; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-F32X2 %s +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ +; RUN: %} + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "nvptx64-nvidia-cuda" + +define <2 x float> @test_ret_const() #0 { +; CHECK-LABEL: test_ret_const( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {0f3F800000, 0f40000000}; +; CHECK-NEXT: ret; + ret <2 x float> +} + +define float @test_extract_0(<2 x float> %a) #0 { +; CHECK-LABEL: test_extract_0( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_0_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %e = extractelement <2 x float> %a, i32 0 + ret float %e +} + +define float @test_extract_1(<2 x float> %a) #0 { +; CHECK-LABEL: test_extract_1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_1_param_0]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %e = extractelement <2 x float> %a, i32 1 + ret float %e +} + +; NOTE: disabled as -O3 miscompiles this into pointer arithmetic on +; test_extract_i_param_0 where the symbol's address is not taken first (that +; is, moved to a temporary) +; define float 
@test_extract_i(<2 x float> %a, i64 %idx) #0 { +; %e = extractelement <2 x float> %a, i64 %idx +; ret float %e +; } + +define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-NOF32X2-LABEL: test_fadd( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_param_1]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fadd_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_param_0]; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_0( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_0( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0]; +; CHECK-F32X2-NEXT: 
mov.b32 %r1, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> , %a + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_1( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_1( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> %a, + ret <2 x float> %r +} + +define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-NOF32X2-LABEL: test_fadd_v4( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<13>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_param_1]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r9, %r4, %r8; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r10, %r3, %r7; +; CHECK-NOF32X2-NEXT: 
add.rn.f32 %r11, %r2, %r6; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r1, %r5; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_v4( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1]; +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0]; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> %a, %b + ret <4 x float> %r +} + +define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_0_v4( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_0_v4( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-NEXT: 
mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> , %a + ret <4 x float> %r +} + +define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_1_v4( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_1_v4( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> %a, + ret <4 x float> %r +} + +define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-NOF32X2-LABEL: test_fsub( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: 
ld.param.v2.b32 {%r1, %r2}, [test_fsub_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_param_1]; +; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fsub( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fsub_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fsub_param_0]; +; CHECK-F32X2-NEXT: sub.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fsub <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fneg(<2 x float> %a) #0 { +; CHECK-LABEL: test_fneg( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_param_0]; +; CHECK-NEXT: neg.f32 %r3, %r2; +; CHECK-NEXT: neg.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = fneg <2 x float> %a + ret <2 x float> %r +} + +define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-NOF32X2-LABEL: test_fmul( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_param_1]; +; CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fmul( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: 
ld.param.b64 %rd2, [test_fmul_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fmul_param_0]; +; CHECK-F32X2-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fmul <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fdiv( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_param_1]; +; CHECK-NEXT: div.rn.f32 %r5, %r2, %r4; +; CHECK-NEXT: div.rn.f32 %r6, %r1, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %r = fdiv <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_frem( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_param_1]; +; CHECK-NEXT: div.rn.f32 %r5, %r2, %r4; +; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-NEXT: neg.f32 %r7, %r6; +; CHECK-NEXT: fma.rn.f32 %r8, %r7, %r4, %r2; +; CHECK-NEXT: testp.infinite.f32 %p1, %r4; +; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p1; +; CHECK-NEXT: div.rn.f32 %r10, %r1, %r3; +; CHECK-NEXT: cvt.rzi.f32.f32 %r11, %r10; +; CHECK-NEXT: neg.f32 %r12, %r11; +; CHECK-NEXT: fma.rn.f32 %r13, %r12, %r3, %r1; +; CHECK-NEXT: testp.infinite.f32 %p2, %r3; +; CHECK-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; +; CHECK-NEXT: ret; + %r = frem <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_ftz( +; CHECK-NOF32X2: 
{ +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_ftz_param_1]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fadd_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_ftz_param_0]; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_0_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_0_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_ftz_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; 
CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> , %a + ret <2 x float> %r +} + +define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_1_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_1_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_ftz_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd2, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fadd <2 x float> %a, + ret <2 x float> %r +} + +define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_v4_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<13>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_ftz_param_1]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r9, %r4, %r8; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r10, %r3, %r7; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r11, %r2, %r6; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r1, %r5; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 
[func_retval0], {%r12, %r11, %r10, %r9}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_v4_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0]; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd5, %rd2, %rd4; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd3; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd5}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> %a, %b + ret <4 x float> %r +} + +define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_0_v4_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_0_v4_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-NEXT: 
add.rn.ftz.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> , %a + ret <4 x float> %r +} + +define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 { +; CHECK-NOF32X2-LABEL: test_fadd_imm_1_v4_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fadd_imm_1_v4_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<7>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0]; +; CHECK-F32X2-NEXT: mov.b32 %r1, 0f40800000; +; CHECK-F32X2-NEXT: mov.b32 %r2, 0f40400000; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-NEXT: mov.b32 %r3, 0f40000000; +; CHECK-F32X2-NEXT: mov.b32 %r4, 0f3F800000; +; CHECK-F32X2-NEXT: mov.b64 %rd5, {%r4, %r3}; +; CHECK-F32X2-NEXT: add.rn.ftz.f32x2 %rd6, %rd1, %rd5; +; CHECK-F32X2-NEXT: st.param.v2.b64 [func_retval0], {%rd6, %rd4}; +; CHECK-F32X2-NEXT: ret; + %r = fadd <4 x float> %a, + ret <4 x float> %r +} + +define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-NOF32X2-LABEL: test_fsub_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; 
CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_ftz_param_1]; +; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fsub_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fsub_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fsub_ftz_param_0]; +; CHECK-F32X2-NEXT: sub.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fsub <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 { +; CHECK-LABEL: test_fneg_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_ftz_param_0]; +; CHECK-NEXT: neg.ftz.f32 %r3, %r2; +; CHECK-NEXT: neg.ftz.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = fneg <2 x float> %a + ret <2 x float> %r +} + +define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-NOF32X2-LABEL: test_fmul_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<7>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_ftz_param_1]; +; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fmul_ftz( +; CHECK-F32X2: { +; 
CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fmul_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fmul_ftz_param_0]; +; CHECK-F32X2-NEXT: mul.rn.ftz.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %r = fmul <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) #2 { +; CHECK-NOF32X2-LABEL: test_fma_ftz( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_ftz_param_2]; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r2, %r4, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fma_ftz( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [test_fma_ftz_param_2]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fma_ftz_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fma_ftz_param_0]; +; CHECK-F32X2-NEXT: fma.rn.ftz.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-F32X2-NEXT: ret; + %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) + ret <2 x float> %r +} + +define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-LABEL: test_fdiv_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, 
[test_fdiv_ftz_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_ftz_param_1]; +; CHECK-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NEXT: div.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %r = fdiv <2 x float> %a, %b + ret <2 x float> %r +} + +define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 { +; CHECK-LABEL: test_frem_ftz( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_ftz_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_ftz_param_1]; +; CHECK-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5; +; CHECK-NEXT: neg.ftz.f32 %r7, %r6; +; CHECK-NEXT: fma.rn.ftz.f32 %r8, %r7, %r4, %r2; +; CHECK-NEXT: testp.infinite.f32 %p1, %r4; +; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p1; +; CHECK-NEXT: div.rn.ftz.f32 %r10, %r1, %r3; +; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10; +; CHECK-NEXT: neg.ftz.f32 %r12, %r11; +; CHECK-NEXT: fma.rn.ftz.f32 %r13, %r12, %r3, %r1; +; CHECK-NEXT: testp.infinite.f32 %p2, %r3; +; CHECK-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; +; CHECK-NEXT: ret; + %r = frem <2 x float> %a, %b + ret <2 x float> %r +} + +define void @test_ldst_v2f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_ldst_v2f32( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2f32_param_0]; +; CHECK-NEXT: ld.b64 %rd3, [%rd1]; +; CHECK-NEXT: st.b64 [%rd2], %rd3; +; CHECK-NEXT: ret; + %t1 = load <2 x float>, ptr %a + store <2 x float> %t1, ptr %b, align 32 + ret void +} + +define void @test_ldst_v3f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_ldst_v3f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; 
CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v3f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3f32_param_0]; +; CHECK-NEXT: ld.b64 %rd3, [%rd1]; +; CHECK-NEXT: ld.b32 %r1, [%rd1+8]; +; CHECK-NEXT: st.b32 [%rd2+8], %r1; +; CHECK-NEXT: st.b64 [%rd2], %rd3; +; CHECK-NEXT: ret; + %t1 = load <3 x float>, ptr %a + store <3 x float> %t1, ptr %b, align 32 + ret void +} + +define void @test_ldst_v4f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_ldst_v4f32( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4f32_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1]; +; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; +; CHECK-NEXT: ret; + %t1 = load <4 x float>, ptr %a + store <4 x float> %t1, ptr %b, align 32 + ret void +} + +define void @test_ldst_v8f32(ptr %a, ptr %b) #0 { +; CHECK-LABEL: test_ldst_v8f32( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8f32_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8f32_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd3, %rd4}, [%rd1]; +; CHECK-NEXT: ld.v2.b64 {%rd5, %rd6}, [%rd1+16]; +; CHECK-NEXT: st.v2.b64 [%rd2+16], {%rd5, %rd6}; +; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; +; CHECK-NEXT: ret; + %t1 = load <8 x float>, ptr %a + store <8 x float> %t1, ptr %b, align 32 + ret void +} + +declare <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) #0 + +define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_call( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_call_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_call_param_0]; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: 
st.param.b64 [param0], %rd1; +; CHECK-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd2; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = call <2 x float> @test_callee(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_call_flipped( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_call_flipped_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0]; +; CHECK-NEXT: { // callseq 1, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd2; +; CHECK-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd1; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); +; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-NEXT: } // callseq 1 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_tailcall_flipped( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_tailcall_flipped_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0]; +; CHECK-NEXT: { // callseq 2, 0 +; CHECK-NEXT: .param .align 8 .b8 param0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd2; +; CHECK-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd1; +; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: call.uni (retval0), 
test_callee, (param0, param1); +; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-NEXT: } // callseq 2 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = tail call <2 x float> @test_callee(<2 x float> %b, <2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_select(<2 x float> %a, <2 x float> %b, i1 zeroext %c) #0 { +; CHECK-LABEL: test_select( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b64 %rd2, [test_select_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_select_param_0]; +; CHECK-NEXT: selp.b64 %rd3, %rd1, %rd2, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = select i1 %c, <2 x float> %a, <2 x float> %b + ret <2 x float> %r +} + +define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) #0 { +; CHECK-LABEL: test_select_cc( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<11>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_3]; +; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5; +; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6; +; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_1]; +; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p2; +; CHECK-NEXT: selp.f32 %r10, %r1, %r7, %p1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; +; CHECK-NEXT: ret; + %cc = fcmp une <2 x float> %c, %d + %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b + ret <2 x float> %r +} + +define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x 
double> %b, <2 x float> %c, <2 x float> %d) #0 { +; CHECK-LABEL: test_select_cc_f64_f32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f64_f32_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f64_f32_param_3]; +; CHECK-NEXT: setp.neu.f32 %p1, %r1, %r3; +; CHECK-NEXT: setp.neu.f32 %p2, %r2, %r4; +; CHECK-NEXT: selp.f64 %rd7, %rd2, %rd4, %p2; +; CHECK-NEXT: selp.f64 %rd8, %rd1, %rd3, %p1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7}; +; CHECK-NEXT: ret; + %cc = fcmp une <2 x float> %c, %d + %r = select <2 x i1> %cc, <2 x double> %a, <2 x double> %b + ret <2 x double> %r +} + +define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x double> %c, <2 x double> %d) #0 { +; CHECK-LABEL: test_select_cc_f32_f64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_select_cc_f32_f64_param_3]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f64_param_0]; +; CHECK-NEXT: setp.neu.f64 %p1, %rd3, %rd5; +; CHECK-NEXT: setp.neu.f64 %p2, %rd4, %rd6; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f64_param_1]; +; CHECK-NEXT: selp.f32 %r5, %r2, %r4, %p2; +; CHECK-NEXT: selp.f32 %r6, %r1, %r3, %p1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %cc = fcmp une <2 x double> %c, %d + %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b + ret <2 x float> %r +} + +define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) 
#0 { +; CHECK-LABEL: test_fcmp_une( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_une_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_une_param_1]; +; CHECK-NEXT: setp.neu.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.neu.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp une <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ueq( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ueq_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ueq_param_1]; +; CHECK-NEXT: setp.equ.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.equ.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ueq <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ugt( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ugt_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ugt_param_1]; +; CHECK-NEXT: setp.gtu.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.gtu.f32 %p2, %r1, %r3; +; CHECK-NEXT: 
selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ugt <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_uge( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uge_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uge_param_1]; +; CHECK-NEXT: setp.geu.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.geu.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp uge <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ult( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ult_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ult_param_1]; +; CHECK-NEXT: setp.ltu.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.ltu.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ult <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ule( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; 
CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ule_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ule_param_1]; +; CHECK-NEXT: setp.leu.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.leu.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ule <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_uno( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uno_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uno_param_1]; +; CHECK-NEXT: setp.nan.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.nan.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp uno <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_one( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_one_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_one_param_1]; +; CHECK-NEXT: setp.ne.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.ne.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 
[func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp one <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_oeq( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oeq_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oeq_param_1]; +; CHECK-NEXT: setp.eq.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.eq.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp oeq <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ogt( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ogt_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ogt_param_1]; +; CHECK-NEXT: setp.gt.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.gt.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ogt <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_oge( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oge_param_0]; +; 
CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oge_param_1]; +; CHECK-NEXT: setp.ge.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.ge.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp oge <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_olt( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_olt_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_olt_param_1]; +; CHECK-NEXT: setp.lt.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.lt.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp olt <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_fcmp_ole( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ole_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ole_param_1]; +; CHECK-NEXT: setp.le.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.le.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ole <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x 
float> %b) #0 { +; CHECK-LABEL: test_fcmp_ord( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ord_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ord_param_1]; +; CHECK-NEXT: setp.num.f32 %p1, %r2, %r4; +; CHECK-NEXT: setp.num.f32 %p2, %r1, %r3; +; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; +; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rs2; +; CHECK-NEXT: ret; + %r = fcmp ord <2 x float> %a, %b + ret <2 x i1> %r +} + +define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptosi_i32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i32_param_0]; +; CHECK-NEXT: cvt.rzi.s32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rzi.s32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = fptosi <2 x float> %a to <2 x i32> + ret <2 x i32> %r +} + +define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptosi_i64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i64_param_0]; +; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %r2; +; CHECK-NEXT: cvt.rzi.s64.f32 %rd3, %r1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ret; + %r = fptosi <2 x float> %a to <2 x i64> + ret <2 x i64> %r +} + +define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptoui_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, 
[test_fptoui_2xi32_param_0]; +; CHECK-NEXT: cvt.rzi.u32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rzi.u32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = fptoui <2 x float> %a to <2 x i32> + ret <2 x i32> %r +} + +define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 { +; CHECK-LABEL: test_fptoui_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi64_param_0]; +; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %r2; +; CHECK-NEXT: cvt.rzi.u64.f32 %rd3, %r1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ret; + %r = fptoui <2 x float> %a to <2 x i64> + ret <2 x i64> %r +} + +define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; +; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = uitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0]; +; CHECK-NEXT: cvt.rn.f32.u64 %r1, %rd2; +; CHECK-NEXT: cvt.rn.f32.u64 %r2, %rd1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NEXT: ret; + %r = uitofp <2 x i64> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, 
[test_sitofp_2xi32_param_0]; +; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = sitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_sitofp_2xi64(<2 x i64> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0]; +; CHECK-NEXT: cvt.rn.f32.s64 %r1, %rd2; +; CHECK-NEXT: cvt.rn.f32.s64 %r2, %rd1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NEXT: ret; + %r = sitofp <2 x i64> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { +; CHECK-NOF32X2-LABEL: test_uitofp_2xi32_fadd( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r1; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r2; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, %r4; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, %r3; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_uitofp_2xi32_fadd( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-F32X2-NEXT: mov.b64 %rd2, 
{%r4, %r3}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: ret; + %c = uitofp <2 x i32> %a to <2 x float> + %r = fadd <2 x float> %b, %c + ret <2 x float> %r +} + +define <2 x float> @test_fptrunc_2xdouble(<2 x double> %a) #0 { +; CHECK-LABEL: test_fptrunc_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fptrunc_2xdouble_param_0]; +; CHECK-NEXT: cvt.rn.f32.f64 %r1, %rd2; +; CHECK-NEXT: cvt.rn.f32.f64 %r2, %rd1; +; CHECK-NEXT: mov.b64 %rd3, {%r2, %r1}; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; + %r = fptrunc <2 x double> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 { +; CHECK-LABEL: test_fpext_2xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: cvt.f64.f32 %rd2, %r2; +; CHECK-NEXT: cvt.f64.f32 %rd3, %r1; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ret; + %r = fpext <2 x float> %a to <2 x double> + ret <2 x double> %r +} + +define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 { +; CHECK-LABEL: test_bitcast_2xfloat_to_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_2xi32_param_0]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %r = bitcast <2 x float> %a to <2 x i32> + ret <2 x i32> %r +} + +define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 { +; CHECK-LABEL: test_bitcast_2xi32_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, 
[test_bitcast_2xi32_to_2xfloat_param_0]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; +; CHECK-NEXT: ret; + %r = bitcast <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_bitcast_double_to_2xfloat(double %a) #0 { +; CHECK-LABEL: test_bitcast_double_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_double_to_2xfloat_param_0]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %r = bitcast double %a to <2 x float> + ret <2 x float> %r +} + +define double @test_bitcast_2xfloat_to_double(<2 x float> %a) #0 { +; CHECK-LABEL: test_bitcast_2xfloat_to_double( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_bitcast_2xfloat_to_double_param_0]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %r = bitcast <2 x float> %a to double + ret double %r +} + +define <2 x float> @test_sqrt(<2 x float> %a) #0 { +; CHECK-LABEL: test_sqrt( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sqrt_param_0]; +; CHECK-NEXT: sqrt.rn.f32 %r3, %r2; +; CHECK-NEXT: sqrt.rn.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.sqrt(<2 x float> %a) + ret <2 x float> %r +} + +;;; Can't do this yet: requires libcall. 
+; XCHECK-LABEL: test_powi( +;define <2 x float> @test_powi(<2 x float> %a, <2 x i32> %b) #0 { +; %r = call <2 x float> @llvm.powi.i32(<2 x float> %a, <2 x i32> %b) +; ret <2 x float> %r +;} + +define <2 x float> @test_sin(<2 x float> %a) #0 #1 { +; CHECK-LABEL: test_sin( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sin_param_0]; +; CHECK-NEXT: sin.approx.f32 %r3, %r2; +; CHECK-NEXT: sin.approx.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.sin(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_cos(<2 x float> %a) #0 #1 { +; CHECK-LABEL: test_cos( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_cos_param_0]; +; CHECK-NEXT: cos.approx.f32 %r3, %r2; +; CHECK-NEXT: cos.approx.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.cos(<2 x float> %a) + ret <2 x float> %r +} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_pow( +;define <2 x float> @test_pow(<2 x float> %a, <2 x float> %b) #0 { +; %r = call <2 x float> @llvm.pow(<2 x float> %a, <2 x float> %b) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp( +;define <2 x float> @test_exp(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.exp(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp2( +;define <2 x float> @test_exp2(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.exp2(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. 
+; XCHECK-LABEL: test_log( +;define <2 x float> @test_log(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.log(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log10( +;define <2 x float> @test_log10(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.log10(<2 x float> %a) +; ret <2 x float> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log2( +;define <2 x float> @test_log2(<2 x float> %a) #0 { +; %r = call <2 x float> @llvm.log2(<2 x float> %a) +; ret <2 x float> %r +;} + + +define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +; CHECK-NOF32X2-LABEL: test_fma( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_param_2]; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fma( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [test_fma_param_2]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fma_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fma_param_0]; +; CHECK-F32X2-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-F32X2-NEXT: ret; + %r = call <2 x float> @llvm.fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) + ret <2 x float> %r +} + +define <2 x float> @test_fabs(<2 x float> %a) #0 { +; CHECK-LABEL: test_fabs( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; 
CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fabs_param_0]; +; CHECK-NEXT: abs.f32 %r3, %r2; +; CHECK-NEXT: abs.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.fabs(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_minnum(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_minnum( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_minnum_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_minnum_param_1]; +; CHECK-NEXT: min.f32 %r5, %r2, %r4; +; CHECK-NEXT: min.f32 %r6, %r1, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.minnum(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_maxnum(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_maxnum( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_maxnum_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_maxnum_param_1]; +; CHECK-NEXT: max.f32 %r5, %r2, %r4; +; CHECK-NEXT: max.f32 %r6, %r1, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.maxnum(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_copysign(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_copysign( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_param_1]; +; CHECK-NEXT: copysign.f32 %r5, %r4, %r2; +; CHECK-NEXT: copysign.f32 %r6, %r3, %r1; +; CHECK-NEXT: st.param.v2.b32 
[func_retval0], {%r6, %r5}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) + ret <2 x float> %r +} + +define <2 x float> @test_copysign_f64(<2 x float> %a, <2 x double> %b) #0 { +; CHECK-LABEL: test_copysign_f64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_f64_param_0]; +; CHECK-NEXT: abs.f32 %r3, %r2; +; CHECK-NEXT: neg.f32 %r4, %r3; +; CHECK-NEXT: shr.u64 %rd4, %rd3, 63; +; CHECK-NEXT: and.b64 %rd5, %rd4, 1; +; CHECK-NEXT: setp.ne.b64 %p1, %rd5, 0; +; CHECK-NEXT: selp.f32 %r5, %r4, %r3, %p1; +; CHECK-NEXT: abs.f32 %r6, %r1; +; CHECK-NEXT: neg.f32 %r7, %r6; +; CHECK-NEXT: shr.u64 %rd6, %rd2, 63; +; CHECK-NEXT: and.b64 %rd7, %rd6, 1; +; CHECK-NEXT: setp.ne.b64 %p2, %rd7, 0; +; CHECK-NEXT: selp.f32 %r8, %r7, %r6, %p2; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; +; CHECK-NEXT: ret; + %tb = fptrunc <2 x double> %b to <2 x float> + %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %tb) + ret <2 x float> %r +} + +define <2 x double> @test_copysign_extended(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: test_copysign_extended( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_extended_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_extended_param_1]; +; CHECK-NEXT: copysign.f32 %r5, %r3, %r1; +; CHECK-NEXT: copysign.f32 %r6, %r4, %r2; +; CHECK-NEXT: cvt.f64.f32 %rd3, %r6; +; CHECK-NEXT: cvt.f64.f32 %rd4, %r5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.copysign(<2 x float> %a, <2 x float> %b) + %xr = fpext <2 x float> %r to <2 x double> + ret <2 x 
double> %xr +} + +define <2 x float> @test_floor(<2 x float> %a) #0 { +; CHECK-LABEL: test_floor( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_floor_param_0]; +; CHECK-NEXT: cvt.rmi.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rmi.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.floor(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_ceil(<2 x float> %a) #0 { +; CHECK-LABEL: test_ceil( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_ceil_param_0]; +; CHECK-NEXT: cvt.rpi.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rpi.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.ceil(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_trunc(<2 x float> %a) #0 { +; CHECK-LABEL: test_trunc( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_param_0]; +; CHECK-NEXT: cvt.rzi.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.trunc(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_rint(<2 x float> %a) #0 { +; CHECK-LABEL: test_rint( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_rint_param_0]; +; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.rint(<2 x float> %a) + 
ret <2 x float> %r +} + +define <2 x float> @test_nearbyint(<2 x float> %a) #0 { +; CHECK-LABEL: test_nearbyint( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_nearbyint_param_0]; +; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.nearbyint(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_roundeven(<2 x float> %a) #0 { +; CHECK-LABEL: test_roundeven( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_roundeven_param_0]; +; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; +; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.roundeven(<2 x float> %a) + ret <2 x float> %r +} + +; check the use of sign mask and 0.5 to implement round +define <2 x float> @test_round(<2 x float> %a) #0 { +; CHECK-LABEL: test_round( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<5>; +; CHECK-NEXT: .reg .b32 %r<19>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_round_param_0]; +; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; +; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; +; CHECK-NEXT: add.rn.f32 %r5, %r2, %r4; +; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; +; CHECK-NEXT: abs.f32 %r7, %r2; +; CHECK-NEXT: setp.gt.f32 %p1, %r7, 0f4B000000; +; CHECK-NEXT: selp.f32 %r8, %r2, %r6, %p1; +; CHECK-NEXT: cvt.rzi.f32.f32 %r9, %r2; +; CHECK-NEXT: setp.lt.f32 %p2, %r7, 0f3F000000; +; CHECK-NEXT: selp.f32 %r10, %r9, %r8, %p2; +; CHECK-NEXT: and.b32 %r11, %r1, -2147483648; +; CHECK-NEXT: or.b32 %r12, %r11, 1056964608; +; CHECK-NEXT: add.rn.f32 %r13, %r1, %r12; +; 
CHECK-NEXT: cvt.rzi.f32.f32 %r14, %r13; +; CHECK-NEXT: abs.f32 %r15, %r1; +; CHECK-NEXT: setp.gt.f32 %p3, %r15, 0f4B000000; +; CHECK-NEXT: selp.f32 %r16, %r1, %r14, %p3; +; CHECK-NEXT: cvt.rzi.f32.f32 %r17, %r1; +; CHECK-NEXT: setp.lt.f32 %p4, %r15, 0f3F000000; +; CHECK-NEXT: selp.f32 %r18, %r17, %r16, %p4; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r18, %r10}; +; CHECK-NEXT: ret; + %r = call <2 x float> @llvm.round(<2 x float> %a) + ret <2 x float> %r +} + +define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 { +; CHECK-NOF32X2-LABEL: test_fmuladd( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<9>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmuladd_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmuladd_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fmuladd_param_2]; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_fmuladd( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd3, [test_fmuladd_param_2]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_fmuladd_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_fmuladd_param_0]; +; CHECK-F32X2-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-F32X2-NEXT: ret; + %r = call <2 x float> @llvm.fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) + ret <2 x float> %r +} + +define <2 x float> @test_shufflevector(<2 x float> %a) #0 { +; CHECK-LABEL: test_shufflevector( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NEXT: ret; + %s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> <i32 1, i32 0> + ret <2 x float> %s +} + +define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 { +; CHECK-LABEL: test_insertelement( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_insertelement_param_0]; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NEXT: ret; + %i = insertelement <2 x float> %a, float %x, i64 1 + ret <2 x float> %i +} + +define <2 x float> @test_sitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { +; CHECK-LABEL: test_sitofp_2xi32_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_to_2xfloat_param_0]; +; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = sitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { +; CHECK-LABEL: test_uitofp_2xi32_to_2xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_to_2xfloat_param_0]; +; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NEXT: ret; + %r = uitofp <2 x i32> %a to <2 x float> + ret <2 x float> %r +} + +attributes #0 = { nounwind } +attributes #1 = { "unsafe-fp-math" = "true" } +attributes #2 = { "denormal-fp-math"="preserve-sign" } diff --git a/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll
b/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll new file mode 100644 index 0000000000000..dc0ec0ff7bb0b --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fp-contract-f32x2.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 | FileCheck %s --check-prefixes=CHECK,DEFAULT +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -fp-contract=fast | %ptxas-verify -arch sm_100 %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 | %ptxas-verify -arch sm_100 %} + +target triple = "nvptx64-unknown-cuda" + +;; FAST-LABEL: @t0 +;; DEFAULT-LABEL: @t0 +define <2 x float> @t0(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; FAST-LABEL: t0( +; FAST: { +; FAST-NEXT: .reg .b64 %rd<5>; +; FAST-EMPTY: +; FAST-NEXT: // %bb.0: +; FAST-NEXT: ld.param.b64 %rd1, [t0_param_0]; +; FAST-NEXT: ld.param.b64 %rd2, [t0_param_1]; +; FAST-NEXT: ld.param.b64 %rd3, [t0_param_2]; +; FAST-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; FAST-NEXT: st.param.b64 [func_retval0], %rd4; +; FAST-NEXT: ret; +; +; DEFAULT-LABEL: t0( +; DEFAULT: { +; DEFAULT-NEXT: .reg .b64 %rd<6>; +; DEFAULT-EMPTY: +; DEFAULT-NEXT: // %bb.0: +; DEFAULT-NEXT: ld.param.b64 %rd1, [t0_param_0]; +; DEFAULT-NEXT: ld.param.b64 %rd2, [t0_param_1]; +; DEFAULT-NEXT: mul.rn.f32x2 %rd3, %rd1, %rd2; +; DEFAULT-NEXT: ld.param.b64 %rd4, [t0_param_2]; +; DEFAULT-NEXT: add.rn.f32x2 %rd5, %rd3, %rd4; +; DEFAULT-NEXT: st.param.b64 [func_retval0], %rd5; +; DEFAULT-NEXT: ret; + %v0 = fmul <2 x float> %a, %b + %v1 = fadd <2 x float> %v0, %c + ret <2 x float> %v1 +} + +;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32x2 +;; to prevent ptxas from fusing this with anything else. 
+define <2 x float> @t1(<2 x float> %a, <2 x float> %b) { +; FAST-LABEL: t1( +; FAST: { +; FAST-NEXT: .reg .b64 %rd<6>; +; FAST-EMPTY: +; FAST-NEXT: // %bb.0: +; FAST-NEXT: ld.param.b64 %rd1, [t1_param_0]; +; FAST-NEXT: ld.param.b64 %rd2, [t1_param_1]; +; FAST-NEXT: add.f32x2 %rd3, %rd1, %rd2; +; FAST-NEXT: sub.f32x2 %rd4, %rd1, %rd2; +; FAST-NEXT: mul.f32x2 %rd5, %rd3, %rd4; +; FAST-NEXT: st.param.b64 [func_retval0], %rd5; +; FAST-NEXT: ret; +; +; DEFAULT-LABEL: t1( +; DEFAULT: { +; DEFAULT-NEXT: .reg .b64 %rd<6>; +; DEFAULT-EMPTY: +; DEFAULT-NEXT: // %bb.0: +; DEFAULT-NEXT: ld.param.b64 %rd1, [t1_param_0]; +; DEFAULT-NEXT: ld.param.b64 %rd2, [t1_param_1]; +; DEFAULT-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; +; DEFAULT-NEXT: sub.rn.f32x2 %rd4, %rd1, %rd2; +; DEFAULT-NEXT: mul.rn.f32x2 %rd5, %rd3, %rd4; +; DEFAULT-NEXT: st.param.b64 [func_retval0], %rd5; +; DEFAULT-NEXT: ret; + %v1 = fadd <2 x float> %a, %b + %v2 = fsub <2 x float> %a, %b + %v3 = fmul <2 x float> %v1, %v2 + ret <2 x float> %v3 +} + +;; Make sure we generate the non ".rn" version when the "contract" flag is +;; present on the instructions +define <2 x float> @t2(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: t2( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [t2_param_1]; +; CHECK-NEXT: add.f32x2 %rd3, %rd1, %rd2; +; CHECK-NEXT: sub.f32x2 %rd4, %rd1, %rd2; +; CHECK-NEXT: mul.f32x2 %rd5, %rd3, %rd4; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd5; +; CHECK-NEXT: ret; + %v1 = fadd contract <2 x float> %a, %b + %v2 = fsub contract <2 x float> %a, %b + %v3 = fmul contract <2 x float> %v1, %v2 + ret <2 x float> %v3 +} + +;; Make sure we always fold to fma when the "contract" flag is present +define <2 x float> @t3(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; CHECK-LABEL: t3( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.b64 %rd1, [t3_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [t3_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [t3_param_2]; +; CHECK-NEXT: fma.rn.f32x2 %rd4, %rd1, %rd2, %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %v0 = fmul contract <2 x float> %a, %b + %v1 = fadd contract <2 x float> %v0, %c + ret <2 x float> %v1 +} diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index e89ab7a5605c3..7efbe6f4ebb36 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -32,57 +32,31 @@ define <2 x i16> @test_ret_const() #0 { } define i16 @test_extract_0(<2 x i16> %a) #0 { -; I16x2-LABEL: test_extract_0( -; I16x2: { -; I16x2-NEXT: .reg .b16 %rs<2>; -; I16x2-NEXT: .reg .b32 %r<3>; -; I16x2-EMPTY: -; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; -; I16x2-NEXT: mov.b32 {%rs1, _}, %r1; -; I16x2-NEXT: cvt.u32.u16 %r2, %rs1; -; I16x2-NEXT: st.param.b32 [func_retval0], %r2; -; I16x2-NEXT: ret; -; -; NO-I16x2-LABEL: test_extract_0( -; NO-I16x2: { -; NO-I16x2-NEXT: .reg .b16 %rs<2>; -; NO-I16x2-NEXT: .reg .b32 %r<3>; -; NO-I16x2-EMPTY: -; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; -; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } -; NO-I16x2-NEXT: cvt.u32.u16 %r2, %rs1; -; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r2; -; NO-I16x2-NEXT: ret; +; COMMON-LABEL: test_extract_0( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_0_param_0]; +; COMMON-NEXT: cvt.u32.u16 %r2, %rs1; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %e = extractelement <2 x i16> %a, i32 0 ret i16 %e } define i16 @test_extract_1(<2 x i16> %a) #0 { -; I16x2-LABEL: test_extract_1( -; I16x2: { -; I16x2-NEXT: .reg .b16 
%rs<2>; -; I16x2-NEXT: .reg .b32 %r<3>; -; I16x2-EMPTY: -; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; -; I16x2-NEXT: mov.b32 {_, %rs1}, %r1; -; I16x2-NEXT: cvt.u32.u16 %r2, %rs1; -; I16x2-NEXT: st.param.b32 [func_retval0], %r2; -; I16x2-NEXT: ret; -; -; NO-I16x2-LABEL: test_extract_1( -; NO-I16x2: { -; NO-I16x2-NEXT: .reg .b16 %rs<2>; -; NO-I16x2-NEXT: .reg .b32 %r<3>; -; NO-I16x2-EMPTY: -; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; -; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; } -; NO-I16x2-NEXT: cvt.u32.u16 %r2, %rs1; -; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r2; -; NO-I16x2-NEXT: ret; +; COMMON-LABEL: test_extract_1( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<3>; +; COMMON-NEXT: .reg .b32 %r<3>; +; COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_1_param_0]; +; COMMON-NEXT: cvt.u32.u16 %r2, %rs2; +; COMMON-NEXT: st.param.b32 [func_retval0], %r2; +; COMMON-NEXT: ret; %e = extractelement <2 x i16> %a, i32 1 ret i16 %e } @@ -97,9 +71,8 @@ define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 { ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; -; COMMON-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_i_param_0]; ; COMMON-NEXT: setp.eq.s64 %p1, %rd1, 0; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; COMMON-NEXT: cvt.u32.u16 %r2, %rs3; ; COMMON-NEXT: st.param.b32 [func_retval0], %r2; @@ -126,12 +99,10 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r2, [test_add_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; NO-I16x2-NEXT: 
add.s16 %rs5, %rs4, %rs2; -; NO-I16x2-NEXT: add.s16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_param_0]; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_add_param_1]; +; NO-I16x2-NEXT: add.s16 %rs5, %rs2, %rs4; +; NO-I16x2-NEXT: add.s16 %rs6, %rs1, %rs3; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %r = add <2 x i16> %a, %b @@ -157,8 +128,7 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<2>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_0_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_imm_0_param_0]; ; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; ; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -185,8 +155,7 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<2>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_1_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_imm_1_param_0]; ; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; ; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -202,12 +171,10 @@ define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r2, [test_sub_param_1]; -; COMMON-NEXT: ld.param.b32 %r1, [test_sub_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; COMMON-NEXT: sub.s16 %rs5, %rs4, %rs2; -; COMMON-NEXT: sub.s16 %rs6, %rs3, %rs1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sub_param_0]; +; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_sub_param_1]; +; COMMON-NEXT: sub.s16 %rs5, %rs2, %rs4; +; COMMON-NEXT: sub.s16 %rs6, %rs1, 
%rs3; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; %r = sub <2 x i16> %a, %b @@ -232,12 +199,10 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smax_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smax_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; NO-I16x2-NEXT: max.s16 %rs5, %rs4, %rs2; -; NO-I16x2-NEXT: max.s16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_smax_param_0]; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_smax_param_1]; +; NO-I16x2-NEXT: max.s16 %rs5, %rs2, %rs4; +; NO-I16x2-NEXT: max.s16 %rs6, %rs1, %rs3; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp sgt <2 x i16> %a, %b @@ -263,12 +228,10 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umax_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umax_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; NO-I16x2-NEXT: max.u16 %rs5, %rs4, %rs2; -; NO-I16x2-NEXT: max.u16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_umax_param_0]; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_umax_param_1]; +; NO-I16x2-NEXT: max.u16 %rs5, %rs2, %rs4; +; NO-I16x2-NEXT: max.u16 %rs6, %rs1, %rs3; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp ugt <2 x i16> %a, %b @@ -294,12 +257,10 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smin_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smin_param_0]; -; 
NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; NO-I16x2-NEXT: min.s16 %rs5, %rs4, %rs2; -; NO-I16x2-NEXT: min.s16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_smin_param_0]; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_smin_param_1]; +; NO-I16x2-NEXT: min.s16 %rs5, %rs2, %rs4; +; NO-I16x2-NEXT: min.s16 %rs6, %rs1, %rs3; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp sle <2 x i16> %a, %b @@ -325,12 +286,10 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umin_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umin_param_0]; -; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; NO-I16x2-NEXT: min.u16 %rs5, %rs4, %rs2; -; NO-I16x2-NEXT: min.u16 %rs6, %rs3, %rs1; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_umin_param_0]; +; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_umin_param_1]; +; NO-I16x2-NEXT: min.u16 %rs5, %rs2, %rs4; +; NO-I16x2-NEXT: min.u16 %rs6, %rs1, %rs3; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp ule <2 x i16> %a, %b @@ -345,12 +304,10 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r2, [test_mul_param_1]; -; COMMON-NEXT: ld.param.b32 %r1, [test_mul_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; COMMON-NEXT: mul.lo.s16 %rs5, %rs4, %rs2; -; COMMON-NEXT: mul.lo.s16 %rs6, %rs3, %rs1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_mul_param_0]; +; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_mul_param_1]; +; COMMON-NEXT: mul.lo.s16 %rs5, %rs2, %rs4; +; COMMON-NEXT: mul.lo.s16 %rs6, %rs1, %rs3; ; COMMON-NEXT: 
st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; %r = mul <2 x i16> %a, %b @@ -729,18 +686,14 @@ define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x ; COMMON-NEXT: .reg .b32 %r<5>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; -; COMMON-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; -; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; -; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r4; -; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r3; -; COMMON-NEXT: setp.ne.s16 %p1, %rs3, %rs1; -; COMMON-NEXT: setp.ne.s16 %p2, %rs4, %rs2; -; COMMON-NEXT: mov.b32 {%rs5, %rs6}, %r2; -; COMMON-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; COMMON-NEXT: selp.b16 %rs9, %rs8, %rs6, %p2; -; COMMON-NEXT: selp.b16 %rs10, %rs7, %rs5, %p1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; +; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_2]; +; COMMON-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_select_cc_param_3]; +; COMMON-NEXT: setp.ne.s16 %p1, %rs3, %rs5; +; COMMON-NEXT: setp.ne.s16 %p2, %rs4, %rs6; +; COMMON-NEXT: ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1]; +; COMMON-NEXT: selp.b16 %rs9, %rs2, %rs8, %p2; +; COMMON-NEXT: selp.b16 %rs10, %rs1, %rs7, %p1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs9}; ; COMMON-NEXT: ret; %cc = icmp ne <2 x i16> %c, %d @@ -758,12 +711,10 @@ define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b, ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i32_i16_param_1]; ; COMMON-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_i32_i16_param_0]; -; COMMON-NEXT: ld.param.b32 %r6, [test_select_cc_i32_i16_param_3]; -; COMMON-NEXT: ld.param.b32 %r5, [test_select_cc_i32_i16_param_2]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r6; -; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r5; -; COMMON-NEXT: setp.ne.s16 %p1, %rs3, %rs1; -; 
COMMON-NEXT: setp.ne.s16 %p2, %rs4, %rs2; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_i32_i16_param_2]; +; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_i32_i16_param_3]; +; COMMON-NEXT: setp.ne.s16 %p1, %rs1, %rs3; +; COMMON-NEXT: setp.ne.s16 %p2, %rs2, %rs4; ; COMMON-NEXT: selp.b32 %r7, %r2, %r4, %p2; ; COMMON-NEXT: selp.b32 %r8, %r1, %r3, %p1; ; COMMON-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; @@ -784,14 +735,12 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_i16_i32_param_3]; ; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i16_i32_param_2]; -; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_i16_i32_param_1]; -; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_i16_i32_param_0]; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_i16_i32_param_0]; ; COMMON-NEXT: setp.ne.s32 %p1, %r3, %r5; ; COMMON-NEXT: setp.ne.s32 %p2, %r4, %r6; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; -; COMMON-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; -; COMMON-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; +; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_i16_i32_param_1]; +; COMMON-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; +; COMMON-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; <2 x i32> %c, <2 x i32> %d) #0 { @@ -902,8 +851,7 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_zext_2xi32_param_0]; ; COMMON-NEXT: cvt.u32.u16 %r2, %rs2; ; COMMON-NEXT: cvt.u32.u16 %r3, %rs1; ; COMMON-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -920,8 +868,7 @@ define <2 x i64> @test_zext_2xi64(<2 x 
i16> %a) #0 { ; COMMON-NEXT: .reg .b64 %rd<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_zext_2xi64_param_0]; ; COMMON-NEXT: cvt.u64.u16 %rd1, %rs2; ; COMMON-NEXT: cvt.u64.u16 %rd2, %rs1; ; COMMON-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -979,8 +926,7 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_shufflevector_param_0]; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; ; COMMON-NEXT: ret; %s = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0> @@ -988,29 +934,16 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { } define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 { -; I16x2-LABEL: test_insertelement( -; I16x2: { -; I16x2-NEXT: .reg .b16 %rs<3>; -; I16x2-NEXT: .reg .b32 %r<2>; -; I16x2-EMPTY: -; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; -; I16x2-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; -; I16x2-NEXT: mov.b32 {%rs2, _}, %r1; -; I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; -; I16x2-NEXT: ret; -; -; NO-I16x2-LABEL: test_insertelement( -; NO-I16x2: { -; NO-I16x2-NEXT: .reg .b16 %rs<3>; -; NO-I16x2-NEXT: .reg .b32 %r<2>; -; NO-I16x2-EMPTY: -; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; -; NO-I16x2-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; -; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; } -; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; -; NO-I16x2-NEXT: ret; +; COMMON-LABEL: test_insertelement( +; COMMON: { +; COMMON-NEXT: .reg .b16 %rs<4>; +; COMMON-NEXT: .reg .b32 %r<2>; +;
COMMON-EMPTY: +; COMMON-NEXT: // %bb.0: +; COMMON-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; +; COMMON-NEXT: ld.param.v2.b16 {%rs2, %rs3}, [test_insertelement_param_0]; +; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; COMMON-NEXT: ret; %i = insertelement <2 x i16> %a, i16 %x, i64 1 ret <2 x i16> %i } @@ -1022,8 +955,7 @@ define <2 x i16> @test_fptosi_2xhalf_to_2xi16(<2 x half> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r1, [test_fptosi_2xhalf_to_2xi16_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_2xhalf_to_2xi16_param_0]; ; COMMON-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; COMMON-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -1039,8 +971,7 @@ define <2 x i16> @test_fptoui_2xhalf_to_2xi16(<2 x half> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b32 %r1, [test_fptoui_2xhalf_to_2xi16_param_0]; -; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xhalf_to_2xi16_param_0]; ; COMMON-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; COMMON-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index fd2e56bb126bb..8a467d3024012 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -1240,18 +1240,16 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; -; CHECK-NEXT: mov.b32 %r3, {%rs4, 
%rs3}; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; -; CHECK-NEXT: cvt.u32.u16 %r4, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs5; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptosi_4xhalf_to_4xi8_param_0]; +; CHECK-NEXT: cvt.rzi.s16.f16 %rs5, %rs4; +; CHECK-NEXT: cvt.rzi.s16.f16 %rs6, %rs3; +; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r4, %rs8; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; ; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7; +; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs2; +; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs1; ; CHECK-NEXT: mov.b32 %r7, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r7; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs12; @@ -1271,18 +1269,16 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; -; CHECK-NEXT: mov.b32 %r3, {%rs4, %rs3}; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; -; CHECK-NEXT: cvt.u32.u16 %r4, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs5; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptoui_4xhalf_to_4xi8_param_0]; +; CHECK-NEXT: cvt.rzi.u16.f16 %rs5, %rs4; +; CHECK-NEXT: cvt.rzi.u16.f16 %rs6, %rs3; +; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r4, %rs8; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; ; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7; +; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs2; +; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs1; ; CHECK-NEXT: 
mov.b32 %r7, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r7; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs12; diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll index 9e9705709f2bd..efa2666090ccc 100644 --- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll +++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll @@ -7,17 +7,16 @@ declare <4 x float> @bar() define void @foo(ptr %ptr) { ; CHECK-LABEL: foo( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 16 .b8 retval0[16]; ; CHECK-NEXT: call.uni (retval0), bar, (); -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [retval0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [retval0]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3}; ; CHECK-NEXT: ret; %val = tail call <4 x float> @bar() store <4 x float> %val, ptr %ptr diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll index a9bd3c1caebe5..187ccc9cd89f7 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll @@ -110,11 +110,11 @@ define void @avar_i64() { define void @avar_float() { ; PTX-LABEL: avar_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; -; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) @globalin, 
!invariant.load !0 store <8 x float> %load, ptr addrspace(1) @globalout @@ -234,11 +234,11 @@ define void @asi_i64() { define void @asi_float() { ; PTX-LABEL: asi_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; -; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0 @@ -364,14 +364,13 @@ define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: areg_64_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0]; -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.b64 %rd2, [areg_64_float_param_1]; -; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd6, [areg_64_float_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) %in, !invariant.load !0 store <8 x float> %load, ptr addrspace(1) %out @@ -510,14 +509,13 @@ define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: ari_64_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: 
; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0]; ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1]; -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0 diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll index 45e17016d8ee8..a17df1ee39883 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll @@ -106,11 +106,11 @@ define void @avar_i64() { define void @avar_float() { ; PTX-LABEL: avar_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; -; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) @globalin store <8 x float> %load, ptr addrspace(1) @globalout @@ -230,11 +230,11 @@ define void @asi_i64() { define void @asi_float() { ; PTX-LABEL: asi_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; -; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.b64 
[globalout+32], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset @@ -360,14 +360,13 @@ define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: areg_64_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0]; -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.b64 %rd2, [areg_64_float_param_1]; -; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd6, [areg_64_float_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) %in store <8 x float> %load, ptr addrspace(1) %out @@ -506,14 +505,13 @@ define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: ari_64_float( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0]; ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1]; -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset diff --git 
a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll index dfbc2c34b15d4..68c53cde7f9ac 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll @@ -139,16 +139,15 @@ define void @generic_4xi64(ptr %a, ptr %b) { define void @generic_8xfloat(ptr %a, ptr %b) { ; CHECK-LABEL: generic_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0]; -; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [generic_8xfloat_param_1]; -; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [generic_8xfloat_param_1]; +; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load <8 x float>, ptr %a store <8 x float> %a.load, ptr %b @@ -291,16 +290,15 @@ define void @generic_volatile_4xi64(ptr %a, ptr %b) { define void @generic_volatile_8xfloat(ptr %a, ptr %b) { ; CHECK-LABEL: generic_volatile_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.volatile.v2.b64 
{%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [generic_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x float>, ptr %a store volatile <8 x float> %a.load, ptr %b @@ -516,28 +514,26 @@ define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-LABEL: global_8xfloat( ; SM90: { -; SM90-NEXT: .reg .b32 %r<9>; -; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-NEXT: .reg .b64 %rd<7>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0]; -; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1]; -; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd6, [global_8xfloat_param_1]; +; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_8xfloat( ; SM100: { -; SM100-NEXT: .reg .b32 %r<9>; -; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-NEXT: .reg .b64 %rd<7>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0]; -; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1]; -; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_8xfloat_param_1]; +; SM100-NEXT: st.global.v4.b64 [%rd6], 
{%rd2, %rd3, %rd4, %rd5}; ; SM100-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(1) %a store <8 x float> %a.load, ptr addrspace(1) %b @@ -762,28 +758,26 @@ define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-LABEL: global_volatile_8xfloat( ; SM90: { -; SM90-NEXT: .reg .b32 %r<9>; -; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-NEXT: .reg .b64 %rd<7>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0]; -; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_8xfloat_param_1]; -; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd6, [global_volatile_8xfloat_param_1]; +; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_volatile_8xfloat( ; SM100: { -; SM100-NEXT: .reg .b32 %r<9>; -; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-NEXT: .reg .b64 %rd<7>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0]; -; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_8xfloat_param_1]; -; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_volatile_8xfloat_param_1]; +; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; 
SM100-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(1) %a store volatile <8 x float> %a.load, ptr addrspace(1) %b @@ -939,16 +933,15 @@ define void @shared_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-LABEL: shared_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0]; -; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [shared_8xfloat_param_1]; -; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [shared_8xfloat_param_1]; +; CHECK-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(3) %a store <8 x float> %a.load, ptr addrspace(3) %b @@ -1091,16 +1084,15 @@ define void @shared_volatile_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-LABEL: shared_volatile_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: 
st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [shared_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(3) %a store volatile <8 x float> %a.load, ptr addrspace(3) %b @@ -1245,16 +1237,15 @@ define void @local_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) { define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-LABEL: local_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0]; -; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [local_8xfloat_param_1]; -; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [local_8xfloat_param_1]; +; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(5) %a store <8 x float> %a.load, ptr addrspace(5) %b @@ -1397,16 +1388,15 @@ define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) { define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-LABEL: local_volatile_8xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: 
ld.param.b64 %rd1, [local_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(5) %a store volatile <8 x float> %a.load, ptr addrspace(5) %b diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll index d494ee30c2821..761b0a9cdc94a 100644 --- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll +++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll @@ -333,30 +333,28 @@ define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) { define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) { ; SM20-LABEL: foo11( ; SM20: { -; SM20-NEXT: .reg .b32 %r<3>; -; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-NEXT: .reg .b64 %rd<6>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.b64 %rd1, [foo11_param_0]; ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM20-NEXT: ld.param.b64 %rd3, [foo11_param_1]; ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM20-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd2]; -; SM20-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; +; SM20-NEXT: ld.global.b64 %rd5, [%rd2]; +; SM20-NEXT: st.global.b64 [%rd4], %rd5; ; SM20-NEXT: ret; ; ; SM35-LABEL: foo11( ; SM35: { -; SM35-NEXT: .reg .b32 %r<3>; -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b64 %rd<6>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.b64 %rd1, 
[foo11_param_0]; ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM35-NEXT: ld.param.b64 %rd3, [foo11_param_1]; ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM35-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd2]; -; SM35-NEXT: st.global.v2.b32 [%rd4], {%r1, %r2}; +; SM35-NEXT: ld.global.nc.b64 %rd5, [%rd2]; +; SM35-NEXT: st.global.b64 [%rd4], %rd5; ; SM35-NEXT: ret; %1 = load <2 x float>, ptr %from store <2 x float> %1, ptr %to @@ -496,30 +494,28 @@ define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) { define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) { ; SM20-LABEL: foo16( ; SM20: { -; SM20-NEXT: .reg .b32 %r<5>; -; SM20-NEXT: .reg .b64 %rd<5>; +; SM20-NEXT: .reg .b64 %rd<7>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: ; SM20-NEXT: ld.param.b64 %rd1, [foo16_param_0]; ; SM20-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM20-NEXT: ld.param.b64 %rd3, [foo16_param_1]; ; SM20-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM20-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; -; SM20-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4}; +; SM20-NEXT: ld.global.v2.b64 {%rd5, %rd6}, [%rd2]; +; SM20-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; ; SM20-NEXT: ret; ; ; SM35-LABEL: foo16( ; SM35: { -; SM35-NEXT: .reg .b32 %r<5>; -; SM35-NEXT: .reg .b64 %rd<5>; +; SM35-NEXT: .reg .b64 %rd<7>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: ; SM35-NEXT: ld.param.b64 %rd1, [foo16_param_0]; ; SM35-NEXT: cvta.to.global.u64 %rd2, %rd1; ; SM35-NEXT: ld.param.b64 %rd3, [foo16_param_1]; ; SM35-NEXT: cvta.to.global.u64 %rd4, %rd3; -; SM35-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; -; SM35-NEXT: st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4}; +; SM35-NEXT: ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2]; +; SM35-NEXT: st.global.v2.b64 [%rd4], {%rd5, %rd6}; ; SM35-NEXT: ret; %1 = load <4 x float>, ptr %from store <4 x float> %1, ptr %to diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll index db8733da5b7e4..dfdb33852305b 
100644 --- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll +++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll @@ -1,131 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx64-nvidia-cuda" -; CHECK-LABEL: t1 define <4 x float> @t1(ptr %p1) { -; CHECK-NOT: ld.v4 -; CHECK-NOT: ld.v2 -; CHECK-NOT: ld.b32 -; CHECK: ld.b8 +; CHECK-LABEL: t1( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<46>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [t1_param_0]; +; CHECK-NEXT: ld.b8 %rd2, [%rd1+8]; +; CHECK-NEXT: ld.b8 %rd3, [%rd1+9]; +; CHECK-NEXT: shl.b64 %rd4, %rd3, 8; +; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2; +; CHECK-NEXT: ld.b8 %rd6, [%rd1+10]; +; CHECK-NEXT: shl.b64 %rd7, %rd6, 16; +; CHECK-NEXT: ld.b8 %rd8, [%rd1+11]; +; CHECK-NEXT: shl.b64 %rd9, %rd8, 24; +; CHECK-NEXT: or.b64 %rd10, %rd9, %rd7; +; CHECK-NEXT: or.b64 %rd11, %rd10, %rd5; +; CHECK-NEXT: ld.b8 %rd12, [%rd1+12]; +; CHECK-NEXT: ld.b8 %rd13, [%rd1+13]; +; CHECK-NEXT: shl.b64 %rd14, %rd13, 8; +; CHECK-NEXT: or.b64 %rd15, %rd14, %rd12; +; CHECK-NEXT: ld.b8 %rd16, [%rd1+14]; +; CHECK-NEXT: shl.b64 %rd17, %rd16, 16; +; CHECK-NEXT: ld.b8 %rd18, [%rd1+15]; +; CHECK-NEXT: shl.b64 %rd19, %rd18, 24; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd17; +; CHECK-NEXT: or.b64 %rd21, %rd20, %rd15; +; CHECK-NEXT: shl.b64 %rd22, %rd21, 32; +; CHECK-NEXT: or.b64 %rd23, %rd22, %rd11; +; CHECK-NEXT: ld.b8 %rd24, [%rd1]; +; CHECK-NEXT: ld.b8 %rd25, [%rd1+1]; +; CHECK-NEXT: shl.b64 %rd26, %rd25, 8; +; CHECK-NEXT: or.b64 %rd27, %rd26, %rd24; +; CHECK-NEXT: ld.b8 %rd28, [%rd1+2]; +; CHECK-NEXT: shl.b64 %rd29, %rd28, 16; +; CHECK-NEXT: ld.b8 %rd30, 
[%rd1+3]; +; CHECK-NEXT: shl.b64 %rd31, %rd30, 24; +; CHECK-NEXT: or.b64 %rd32, %rd31, %rd29; +; CHECK-NEXT: or.b64 %rd33, %rd32, %rd27; +; CHECK-NEXT: ld.b8 %rd34, [%rd1+4]; +; CHECK-NEXT: ld.b8 %rd35, [%rd1+5]; +; CHECK-NEXT: shl.b64 %rd36, %rd35, 8; +; CHECK-NEXT: or.b64 %rd37, %rd36, %rd34; +; CHECK-NEXT: ld.b8 %rd38, [%rd1+6]; +; CHECK-NEXT: shl.b64 %rd39, %rd38, 16; +; CHECK-NEXT: ld.b8 %rd40, [%rd1+7]; +; CHECK-NEXT: shl.b64 %rd41, %rd40, 24; +; CHECK-NEXT: or.b64 %rd42, %rd41, %rd39; +; CHECK-NEXT: or.b64 %rd43, %rd42, %rd37; +; CHECK-NEXT: shl.b64 %rd44, %rd43, 32; +; CHECK-NEXT: or.b64 %rd45, %rd44, %rd33; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd45, %rd23}; +; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 1 ret <4 x float> %r } -; CHECK-LABEL: t2 define <4 x float> @t2(ptr %p1) { -; CHECK-NOT: ld.v4 -; CHECK-NOT: ld.v2 -; CHECK: ld.b32 +; CHECK-LABEL: t2( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0]; +; CHECK-NEXT: ld.b32 %rd2, [%rd1+8]; +; CHECK-NEXT: ld.b32 %rd3, [%rd1+12]; +; CHECK-NEXT: shl.b64 %rd4, %rd3, 32; +; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2; +; CHECK-NEXT: ld.b32 %rd6, [%rd1]; +; CHECK-NEXT: ld.b32 %rd7, [%rd1+4]; +; CHECK-NEXT: shl.b64 %rd8, %rd7, 32; +; CHECK-NEXT: or.b64 %rd9, %rd8, %rd6; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd9, %rd5}; +; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 4 ret <4 x float> %r } -; CHECK-LABEL: t3 define <4 x float> @t3(ptr %p1) { -; CHECK-NOT: ld.v4 -; CHECK: ld.v2 +; CHECK-LABEL: t3( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [t3_param_0]; +; CHECK-NEXT: ld.b64 %rd2, [%rd1+8]; +; CHECK-NEXT: ld.b64 %rd3, [%rd1]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; +; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 8 ret <4 x float> %r } -; CHECK-LABEL: t4 define <4 x float> @t4(ptr %p1) { -; 
CHECK: ld.v4 +; CHECK-LABEL: t4( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [t4_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd3}; +; CHECK-NEXT: ret; %r = load <4 x float>, ptr %p1, align 16 ret <4 x float> %r } -; CHECK-LABEL: .visible .func test_v1halfp0a1( -; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v1halfp0a1_param_0]; -; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v1halfp0a1_param_1]; -; CHECK-DAG: ld.b8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.b8 [%[[TO]]], [[B0]] -; CHECK-DAG: ld.b8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.b8 [%[[TO]]+1], [[B1]] -; CHECK: ret define void @test_v1halfp0a1(ptr noalias readonly %from, ptr %to) { +; CHECK-LABEL: test_v1halfp0a1( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v1halfp0a1_param_0]; +; CHECK-NEXT: ld.b8 %rs1, [%rd1]; +; CHECK-NEXT: ld.b8 %rs2, [%rd1+1]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_v1halfp0a1_param_1]; +; CHECK-NEXT: st.b8 [%rd2+1], %rs2; +; CHECK-NEXT: st.b8 [%rd2], %rs1; +; CHECK-NEXT: ret; %1 = load <1 x half>, ptr %from , align 1 store <1 x half> %1, ptr %to , align 1 ret void } -; CHECK-LABEL: .visible .func test_v2halfp0a1( -; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v2halfp0a1_param_0]; -; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v2halfp0a1_param_1]; -; CHECK-DAG: ld.b8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.b8 [%[[TO]]], -; CHECK-DAG: ld.b8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.b8 [%[[TO]]+1], -; CHECK-DAG: ld.b8 [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2] -; CHECK-DAG: st.b8 [%[[TO]]+2], -; CHECK-DAG: ld.b8 [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3] -; CHECK-DAG: st.b8 [%[[TO]]+3], -; CHECK: ret define void @test_v2halfp0a1(ptr noalias readonly %from, ptr %to) { +; CHECK-LABEL: 
test_v2halfp0a1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v2halfp0a1_param_0]; +; CHECK-NEXT: ld.b8 %r1, [%rd1+1]; +; CHECK-NEXT: ld.b8 %r2, [%rd1]; +; CHECK-NEXT: ld.b8 %r3, [%rd1+3]; +; CHECK-NEXT: ld.b8 %r4, [%rd1+2]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_v2halfp0a1_param_1]; +; CHECK-NEXT: st.b8 [%rd2+2], %r4; +; CHECK-NEXT: st.b8 [%rd2+3], %r3; +; CHECK-NEXT: st.b8 [%rd2], %r2; +; CHECK-NEXT: st.b8 [%rd2+1], %r1; +; CHECK-NEXT: ret; %1 = load <2 x half>, ptr %from , align 1 store <2 x half> %1, ptr %to , align 1 ret void } -; CHECK-LABEL: .visible .func test_v4halfp0a1( -; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v4halfp0a1_param_0]; -; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v4halfp0a1_param_1]; -; CHECK-DAG: ld.b8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.b8 [%[[TO]]], [[B0]] -; CHECK-DAG: ld.b8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.b8 [%[[TO]]+1], [[B1]] -; CHECK-DAG: ld.b8 [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2] -; CHECK-DAG: st.b8 [%[[TO]]+2], [[B2]] -; CHECK-DAG: ld.b8 [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3] -; CHECK-DAG: st.b8 [%[[TO]]+3], [[B3]] -; CHECK-DAG: ld.b8 [[B4:%r[sd]?[0-9]+]], [%[[FROM]]+4] -; CHECK-DAG: st.b8 [%[[TO]]+4], [[B4]] -; CHECK-DAG: ld.b8 [[B5:%r[sd]?[0-9]+]], [%[[FROM]]+5] -; CHECK-DAG: st.b8 [%[[TO]]+5], [[B5]] -; CHECK-DAG: ld.b8 [[B6:%r[sd]?[0-9]+]], [%[[FROM]]+6] -; CHECK-DAG: st.b8 [%[[TO]]+6], [[B6]] -; CHECK-DAG: ld.b8 [[B7:%r[sd]?[0-9]+]], [%[[FROM]]+7] -; CHECK-DAG: st.b8 [%[[TO]]+7], [[B7]] -; CHECK: ret define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) { +; CHECK-LABEL: test_v4halfp0a1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v4halfp0a1_param_0]; +; CHECK-NEXT: ld.b8 %r1, [%rd1+1]; +; CHECK-NEXT: ld.b8 %r2, [%rd1]; +; CHECK-NEXT: ld.b8 
%r3, [%rd1+3]; +; CHECK-NEXT: ld.b8 %r4, [%rd1+2]; +; CHECK-NEXT: ld.b8 %r5, [%rd1+5]; +; CHECK-NEXT: ld.b8 %r6, [%rd1+4]; +; CHECK-NEXT: ld.b8 %r7, [%rd1+7]; +; CHECK-NEXT: ld.b8 %r8, [%rd1+6]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_v4halfp0a1_param_1]; +; CHECK-NEXT: st.b8 [%rd2+6], %r8; +; CHECK-NEXT: st.b8 [%rd2+7], %r7; +; CHECK-NEXT: st.b8 [%rd2+4], %r6; +; CHECK-NEXT: st.b8 [%rd2+5], %r5; +; CHECK-NEXT: st.b8 [%rd2+2], %r4; +; CHECK-NEXT: st.b8 [%rd2+3], %r3; +; CHECK-NEXT: st.b8 [%rd2], %r2; +; CHECK-NEXT: st.b8 [%rd2+1], %r1; +; CHECK-NEXT: ret; %1 = load <4 x half>, ptr %from , align 1 store <4 x half> %1, ptr %to , align 1 ret void } -; CHECK-LABEL: s1 define void @s1(ptr %p1, <4 x float> %v) { -; CHECK-NOT: st.v4 -; CHECK-NOT: st.v2 -; CHECK-NOT: st.b32 -; CHECK: st.b8 +; CHECK-LABEL: s1( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<18>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [s1_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s1_param_1]; +; CHECK-NEXT: st.b8 [%rd1+8], %rd3; +; CHECK-NEXT: st.b8 [%rd1], %rd2; +; CHECK-NEXT: shr.u64 %rd4, %rd3, 56; +; CHECK-NEXT: st.b8 [%rd1+15], %rd4; +; CHECK-NEXT: shr.u64 %rd5, %rd3, 48; +; CHECK-NEXT: st.b8 [%rd1+14], %rd5; +; CHECK-NEXT: shr.u64 %rd6, %rd3, 40; +; CHECK-NEXT: st.b8 [%rd1+13], %rd6; +; CHECK-NEXT: shr.u64 %rd7, %rd3, 32; +; CHECK-NEXT: st.b8 [%rd1+12], %rd7; +; CHECK-NEXT: shr.u64 %rd8, %rd3, 24; +; CHECK-NEXT: st.b8 [%rd1+11], %rd8; +; CHECK-NEXT: shr.u64 %rd9, %rd3, 16; +; CHECK-NEXT: st.b8 [%rd1+10], %rd9; +; CHECK-NEXT: shr.u64 %rd10, %rd3, 8; +; CHECK-NEXT: st.b8 [%rd1+9], %rd10; +; CHECK-NEXT: shr.u64 %rd11, %rd2, 56; +; CHECK-NEXT: st.b8 [%rd1+7], %rd11; +; CHECK-NEXT: shr.u64 %rd12, %rd2, 48; +; CHECK-NEXT: st.b8 [%rd1+6], %rd12; +; CHECK-NEXT: shr.u64 %rd13, %rd2, 40; +; CHECK-NEXT: st.b8 [%rd1+5], %rd13; +; CHECK-NEXT: shr.u64 %rd14, %rd2, 32; +; CHECK-NEXT: st.b8 [%rd1+4], %rd14; +; CHECK-NEXT: shr.u64 %rd15, %rd2, 24; +; CHECK-NEXT: st.b8 
[%rd1+3], %rd15; +; CHECK-NEXT: shr.u64 %rd16, %rd2, 16; +; CHECK-NEXT: st.b8 [%rd1+2], %rd16; +; CHECK-NEXT: shr.u64 %rd17, %rd2, 8; +; CHECK-NEXT: st.b8 [%rd1+1], %rd17; +; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 1 ret void } -; CHECK-LABEL: s2 define void @s2(ptr %p1, <4 x float> %v) { -; CHECK-NOT: st.v4 -; CHECK-NOT: st.v2 -; CHECK: st.b32 +; CHECK-LABEL: s2( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [s2_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s2_param_1]; +; CHECK-NEXT: st.b32 [%rd1+8], %rd3; +; CHECK-NEXT: st.b32 [%rd1], %rd2; +; CHECK-NEXT: shr.u64 %rd4, %rd3, 32; +; CHECK-NEXT: st.b32 [%rd1+12], %rd4; +; CHECK-NEXT: shr.u64 %rd5, %rd2, 32; +; CHECK-NEXT: st.b32 [%rd1+4], %rd5; +; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 4 ret void } -; CHECK-LABEL: s3 define void @s3(ptr %p1, <4 x float> %v) { -; CHECK-NOT: st.v4 +; CHECK-LABEL: s3( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [s3_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s3_param_1]; +; CHECK-NEXT: st.b64 [%rd1+8], %rd3; +; CHECK-NEXT: st.b64 [%rd1], %rd2; +; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 8 ret void } -; CHECK-LABEL: s4 define void @s4(ptr %p1, <4 x float> %v) { -; CHECK: st.v4 +; CHECK-LABEL: s4( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [s4_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s4_param_1]; +; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3}; +; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 16 ret void } diff --git a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll index aa463b510fe84..c78fcddb7ed0f 100644 --- a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll +++ 
b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %} @@ -8,23 +9,41 @@ @gv_float2 = external constant <2 x float> @gv_float4 = external constant <4 x float> -; CHECK-LABEL: test_gv_float() define float @test_gv_float() { -; CHECK: ld.global.nc.b32 +; CHECK-LABEL: test_gv_float( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.global.nc.b32 %r1, [gv_float]; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %v = load float, ptr @gv_float ret float %v } -; CHECK-LABEL: test_gv_float2() define <2 x float> @test_gv_float2() { -; CHECK: ld.global.nc.v2.b32 +; CHECK-LABEL: test_gv_float2( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.global.nc.b64 %rd1, [gv_float2]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; %v = load <2 x float>, ptr @gv_float2 ret <2 x float> %v } -; CHECK-LABEL: test_gv_float4() define <4 x float> @test_gv_float4() { -; CHECK: ld.global.nc.v4.b32 +; CHECK-LABEL: test_gv_float4( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.global.nc.v2.b64 {%rd1, %rd2}, [gv_float4]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; %v = load <4 x float>, ptr @gv_float4 ret <4 x float> %v } diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index e10949f95fac4..87f965c84b6b6 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -114,18 +114,19 @@ define float @reduce_fadd_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fadd_float( 
; CHECK: { ; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0]; -; CHECK-NEXT: add.rn.f32 %r9, %r1, 0f00000000; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r2; -; CHECK-NEXT: add.rn.f32 %r11, %r10, %r3; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r4; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r5; -; CHECK-NEXT: add.rn.f32 %r14, %r13, %r6; -; CHECK-NEXT: add.rn.f32 %r15, %r14, %r7; -; CHECK-NEXT: add.rn.f32 %r16, %r15, %r8; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0]; +; CHECK-NEXT: add.rn.f32 %r9, %r5, 0f00000000; +; CHECK-NEXT: add.rn.f32 %r10, %r9, %r6; +; CHECK-NEXT: add.rn.f32 %r11, %r10, %r7; +; CHECK-NEXT: add.rn.f32 %r12, %r11, %r8; +; CHECK-NEXT: add.rn.f32 %r13, %r12, %r1; +; CHECK-NEXT: add.rn.f32 %r14, %r13, %r2; +; CHECK-NEXT: add.rn.f32 %r15, %r14, %r3; +; CHECK-NEXT: add.rn.f32 %r16, %r15, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0], %r16; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) @@ -133,45 +134,89 @@ define float @reduce_fadd_float(<8 x float> %in) { } define float @reduce_fadd_float_reassoc(<8 x float> %in) { -; CHECK-LABEL: reduce_fadd_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<17>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0]; -; CHECK-NEXT: add.rn.f32 %r9, %r3, %r7; -; CHECK-NEXT: add.rn.f32 %r10, %r1, %r5; -; CHECK-NEXT: add.rn.f32 %r11, %r4, %r8; -; CHECK-NEXT: add.rn.f32 %r12, %r2, %r6; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r11; -; CHECK-NEXT: add.rn.f32 %r14, %r10, %r9; -; CHECK-NEXT: 
add.rn.f32 %r15, %r14, %r13; -; CHECK-NEXT: add.rn.f32 %r16, %r15, 0f00000000; -; CHECK-NEXT: st.param.b32 [func_retval0], %r16; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fadd_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<17>; +; CHECK-SM80-NEXT: .reg .b64 %rd<5>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0]; +; CHECK-SM80-NEXT: add.rn.f32 %r9, %r7, %r3; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r5, %r1; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r8, %r4; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r6, %r2; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r12, %r11; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r10, %r9; +; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r13; +; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, 0f00000000; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r16; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fadd_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<5>; +; CHECK-SM100-NEXT: .reg .b64 %rd<10>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_reassoc_param_0]; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd5, %rd2, %rd4; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd6, %rd1, %rd3; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd7, %rd6, %rd5; +; CHECK-SM100-NEXT: mov.b64 {_, %r1}, %rd7; +; CHECK-SM100-NEXT: // implicit-def: %r2 +; CHECK-SM100-NEXT: mov.b64 %rd8, {%r1, %r2}; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd9, %rd7, %rd8; +; CHECK-SM100-NEXT: mov.b64 {%r3, _}, %rd9; +; CHECK-SM100-NEXT: add.rn.f32 %r4, %r3, 0f00000000; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) ret float %res } 
define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) { -; CHECK-LABEL: reduce_fadd_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<15>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: add.rn.f32 %r8, %r3, %r7; -; CHECK-NEXT: add.rn.f32 %r9, %r1, %r5; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8; -; CHECK-NEXT: add.rn.f32 %r11, %r2, %r6; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r4; -; CHECK-NEXT: add.rn.f32 %r13, %r10, %r12; -; CHECK-NEXT: add.rn.f32 %r14, %r13, 0f00000000; -; CHECK-NEXT: st.param.b32 [func_retval0], %r14; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fadd_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<15>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: add.rn.f32 %r8, %r3, %r7; +; CHECK-SM80-NEXT: add.rn.f32 %r9, %r1, %r5; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r9, %r8; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r2, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r11, %r4; +; CHECK-SM80-NEXT: add.rn.f32 %r13, %r10, %r12; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r13, 0f00000000; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r14; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fadd_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<13>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b64 %rd1, 
[reduce_fadd_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [reduce_fadd_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fadd_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd4, %rd2, %rd1; +; CHECK-SM100-NEXT: mov.b32 %r8, 0f80000000; +; CHECK-SM100-NEXT: mov.b64 %rd5, {%r7, %r8}; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd6, %rd3, %rd5; +; CHECK-SM100-NEXT: add.rn.f32x2 %rd7, %rd4, %rd6; +; CHECK-SM100-NEXT: mov.b64 {%r9, %r10}, %rd7; +; CHECK-SM100-NEXT: add.rn.f32 %r11, %r9, %r10; +; CHECK-SM100-NEXT: add.rn.f32 %r12, %r11, 0f00000000; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r12; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <7 x float> %in) ret float %res } @@ -273,17 +318,18 @@ define float @reduce_fmul_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmul_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0]; -; CHECK-NEXT: mul.rn.f32 %r9, %r1, %r2; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r3; -; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r4; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r5; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r6; -; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r7; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r8; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0]; +; CHECK-NEXT: mul.rn.f32 %r9, %r5, %r6; +; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r7; +; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r8; +; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r1; +; CHECK-NEXT: 
mul.rn.f32 %r13, %r12, %r2; +; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r3; +; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) @@ -291,43 +337,85 @@ define float @reduce_fmul_float(<8 x float> %in) { } define float @reduce_fmul_float_reassoc(<8 x float> %in) { -; CHECK-LABEL: reduce_fmul_float_reassoc( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0]; -; CHECK-NEXT: mul.rn.f32 %r9, %r3, %r7; -; CHECK-NEXT: mul.rn.f32 %r10, %r1, %r5; -; CHECK-NEXT: mul.rn.f32 %r11, %r4, %r8; -; CHECK-NEXT: mul.rn.f32 %r12, %r2, %r6; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r11; -; CHECK-NEXT: mul.rn.f32 %r14, %r10, %r9; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r13; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmul_float_reassoc( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-NEXT: .reg .b64 %rd<5>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r7, %r3; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r5, %r1; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r8, %r4; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r6, %r2; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r12, %r11; +; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r10, %r9; +; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r13; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmul_float_reassoc( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<4>; +; CHECK-SM100-NEXT: .reg 
.b64 %rd<10>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd5, %rd2, %rd4; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd6, %rd1, %rd3; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd7, %rd6, %rd5; +; CHECK-SM100-NEXT: mov.b64 {_, %r1}, %rd7; +; CHECK-SM100-NEXT: // implicit-def: %r2 +; CHECK-SM100-NEXT: mov.b64 %rd8, {%r1, %r2}; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd9, %rd7, %rd8; +; CHECK-SM100-NEXT: mov.b64 {%r3, _}, %rd9; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) ret float %res } define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) { -; CHECK-LABEL: reduce_fmul_float_reassoc_nonpow2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0]; -; CHECK-NEXT: mul.rn.f32 %r8, %r3, %r7; -; CHECK-NEXT: mul.rn.f32 %r9, %r1, %r5; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r8; -; CHECK-NEXT: mul.rn.f32 %r11, %r2, %r6; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r4; -; CHECK-NEXT: mul.rn.f32 %r13, %r10, %r12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r13; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_fmul_float_reassoc_nonpow2( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<14>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM80-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, 
%r3, %r4}, [reduce_fmul_float_reassoc_nonpow2_param_0]; +; CHECK-SM80-NEXT: mul.rn.f32 %r8, %r3, %r7; +; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r1, %r5; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r9, %r8; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r2, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r11, %r4; +; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r10, %r12; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r13; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_fmul_float_reassoc_nonpow2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<12>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b64 %rd1, [reduce_fmul_float_reassoc_nonpow2_param_0+16]; +; CHECK-SM100-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [reduce_fmul_float_reassoc_nonpow2_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-SM100-NEXT: ld.param.b32 %r7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd4, %rd2, %rd1; +; CHECK-SM100-NEXT: mov.b32 %r8, 0f3F800000; +; CHECK-SM100-NEXT: mov.b64 %rd5, {%r7, %r8}; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd6, %rd3, %rd5; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd7, %rd4, %rd6; +; CHECK-SM100-NEXT: mov.b64 {%r9, %r10}, %rd7; +; CHECK-SM100-NEXT: mul.rn.f32 %r11, %r9, %r10; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r11; +; CHECK-SM100-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in) ret float %res } @@ -403,15 +491,16 @@ define float @reduce_fmax_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmax_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0]; -; CHECK-NEXT: max.f32 %r9, %r4, %r8; -; 
CHECK-NEXT: max.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0]; +; CHECK-NEXT: max.f32 %r9, %r8, %r4; +; CHECK-NEXT: max.f32 %r10, %r6, %r2; ; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.f32 %r13, %r1, %r5; +; CHECK-NEXT: max.f32 %r12, %r7, %r3; +; CHECK-NEXT: max.f32 %r13, %r5, %r1; ; CHECK-NEXT: max.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -425,15 +514,16 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fmax_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0]; -; CHECK-NEXT: max.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0]; +; CHECK-NEXT: max.f32 %r9, %r8, %r4; +; CHECK-NEXT: max.f32 %r10, %r6, %r2; ; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.f32 %r13, %r1, %r5; +; CHECK-NEXT: max.f32 %r12, %r7, %r3; +; CHECK-NEXT: max.f32 %r13, %r5, %r1; ; CHECK-NEXT: max.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -535,15 +625,16 @@ define float @reduce_fmin_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmin_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0+16]; -; 
CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0]; -; CHECK-NEXT: min.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0]; +; CHECK-NEXT: min.f32 %r9, %r8, %r4; +; CHECK-NEXT: min.f32 %r10, %r6, %r2; ; CHECK-NEXT: min.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.f32 %r13, %r1, %r5; +; CHECK-NEXT: min.f32 %r12, %r7, %r3; +; CHECK-NEXT: min.f32 %r13, %r5, %r1; ; CHECK-NEXT: min.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -557,15 +648,16 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fmin_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0]; -; CHECK-NEXT: min.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0]; +; CHECK-NEXT: min.f32 %r9, %r8, %r4; +; CHECK-NEXT: min.f32 %r10, %r6, %r2; ; CHECK-NEXT: min.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.f32 %r13, %r1, %r5; +; CHECK-NEXT: min.f32 %r12, %r7, %r3; +; CHECK-NEXT: min.f32 %r13, %r5, %r1; ; CHECK-NEXT: min.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -667,15 +759,16 @@ define float @reduce_fmaximum_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fmaximum_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; 
CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.NaN.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0]; +; CHECK-NEXT: max.NaN.f32 %r9, %r8, %r4; +; CHECK-NEXT: max.NaN.f32 %r10, %r6, %r2; ; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: max.NaN.f32 %r12, %r7, %r3; +; CHECK-NEXT: max.NaN.f32 %r13, %r5, %r1; ; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -689,15 +782,16 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fmaximum_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: max.NaN.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0]; +; CHECK-NEXT: max.NaN.f32 %r9, %r8, %r4; +; CHECK-NEXT: max.NaN.f32 %r10, %r6, %r2; ; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: max.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: max.NaN.f32 %r12, %r7, %r3; +; CHECK-NEXT: max.NaN.f32 %r13, %r5, %r1; ; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.NaN.f32 %r15, 
%r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -799,15 +893,16 @@ define float @reduce_fminimum_float(<8 x float> %in) { ; CHECK-LABEL: reduce_fminimum_float( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.NaN.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0]; +; CHECK-NEXT: min.NaN.f32 %r9, %r8, %r4; +; CHECK-NEXT: min.NaN.f32 %r10, %r6, %r2; ; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: min.NaN.f32 %r12, %r7, %r3; +; CHECK-NEXT: min.NaN.f32 %r13, %r5, %r1; ; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -821,15 +916,16 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) { ; CHECK-LABEL: reduce_fminimum_float_reassoc( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<16>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r4, %r8; -; CHECK-NEXT: min.NaN.f32 %r10, %r2, %r6; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0]; +; CHECK-NEXT: min.NaN.f32 %r9, %r8, %r4; +; CHECK-NEXT: min.NaN.f32 %r10, %r6, %r2; ; CHECK-NEXT: min.NaN.f32 %r11, %r10, 
%r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r7; -; CHECK-NEXT: min.NaN.f32 %r13, %r1, %r5; +; CHECK-NEXT: min.NaN.f32 %r12, %r7, %r3; +; CHECK-NEXT: min.NaN.f32 %r13, %r5, %r1; ; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll index 765e50554c8d2..29939e323b4b1 100644 --- a/llvm/test/CodeGen/NVPTX/vec-param-load.ll +++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} @@ -5,75 +6,104 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define <16 x float> @test_v16f32(<16 x float> %a) { ; CHECK-LABEL: test_v16f32( -; CHECK-DAG: ld.param.v4.b32 {[[V_12_15:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48]; -; CHECK-DAG: ld.param.v4.b32 {[[V_8_11:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32]; -; CHECK-DAG: ld.param.v4.b32 {[[V_4_7:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16]; -; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v16f32_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_4_7]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+32], {[[V_8_11]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+48], {[[V_12_15]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v16f32_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v16f32_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v16f32_param_0+32]; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v16f32_param_0+48]; +; CHECK-NEXT: st.param.v2.b64 
[func_retval0+48], {%rd7, %rd8}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd5, %rd6}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; ret <16 x float> %a } define <8 x float> @test_v8f32(<8 x float> %a) { ; CHECK-LABEL: test_v8f32( -; CHECK-DAG: ld.param.v4.b32 {[[V_4_7:(%r[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16]; -; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v8f32_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_4_7]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8f32_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8f32_param_0+16]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; ret <8 x float> %a } define <4 x float> @test_v4f32(<4 x float> %a) { ; CHECK-LABEL: test_v4f32( -; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%r[0-9]+[, ]*){4}]]}, [test_v4f32_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v4f32_param_0]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; ret <4 x float> %a } define <2 x float> @test_v2f32(<2 x float> %a) { ; CHECK-LABEL: test_v2f32( -; CHECK-DAG: ld.param.v2.b32 {[[V_0_3:(%r[0-9]+[, ]*){2}]]}, [test_v2f32_param_0]; -; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[V_0_3]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0]; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; ret <2 x float> %a } ; Oddly 
shaped vectors should not load any extra elements. define <3 x float> @test_v3f32(<3 x float> %a) { ; CHECK-LABEL: test_v3f32( -; CHECK-DAG: ld.param.b32 [[V_2:%r[0-9]+]], [test_v3f32_param_0+8]; -; CHECK-DAG: ld.param.v2.b32 {[[V_0_1:(%r[0-9]+[, ]*){2}]]}, [test_v3f32_param_0]; -; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[V_0_1]]} -; CHECK-DAG: st.param.b32 [func_retval0+8], [[V_2]] -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8]; +; CHECK-NEXT: st.param.b32 [func_retval0+8], %r1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; ret <3 x float> %a } define <8 x i64> @test_v8i64(<8 x i64> %a) { ; CHECK-LABEL: test_v8i64( -; CHECK-DAG: ld.param.v2.b64 {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48]; -; CHECK-DAG: ld.param.v2.b64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32]; -; CHECK-DAG: ld.param.v2.b64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16]; -; CHECK-DAG: ld.param.v2.b64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0]; -; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_1]]} -; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_2_3]]} -; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_4_5]]} -; CHECK-DAG: st.param.v2.b64 [func_retval0+48], {[[V_6_7]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8i64_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8i64_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v8i64_param_0+32]; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v8i64_param_0+48]; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+48], {%rd7, %rd8}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd5, %rd6}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], 
{%rd3, %rd4}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; +; CHECK-NEXT: ret; ret <8 x i64> %a } define <16 x i16> @test_v16i16(<16 x i16> %a) { ; CHECK-LABEL: test_v16i16( -; CHECK-DAG: ld.param.v4.b32 {[[V_8_15:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16]; -; CHECK-DAG: ld.param.v4.b32 {[[V_0_7:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0]; -; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_7]]} -; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_8_15]]} -; CHECK: ret; +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v16i16_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v16i16_param_0+16]; +; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; ret <16 x i16> %a } diff --git a/llvm/test/CodeGen/NVPTX/vector-args.ll b/llvm/test/CodeGen/NVPTX/vector-args.ll index b08c19206a0b8..17468d56aa574 100644 --- a/llvm/test/CodeGen/NVPTX/vector-args.ll +++ b/llvm/test/CodeGen/NVPTX/vector-args.ll @@ -1,10 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define float @foo(<2 x float> %a) { -; CHECK: .func (.param .b32 func_retval0) foo -; CHECK: .param .align 8 .b8 foo_param_0[8] -; CHECK: ld.param.v2.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK-LABEL: foo( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [foo_param_0]; +; CHECK-NEXT: mul.rn.f32 %r3, %r2, %r2; +; CHECK-NEXT: mul.rn.f32 %r4, %r1, %r1; +; CHECK-NEXT: add.rn.f32 %r5, %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: ret; %t1 = fmul <2 x float> %a, %a %t2 = extractelement <2 x 
float> %t1, i32 0 %t3 = extractelement <2 x float> %t1, i32 1 @@ -14,9 +23,17 @@ define float @foo(<2 x float> %a) { define float @bar(<4 x float> %a) { -; CHECK: .func (.param .b32 func_retval0) bar -; CHECK: .param .align 16 .b8 bar_param_0[16] -; CHECK: ld.param.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK-LABEL: bar( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [bar_param_0]; +; CHECK-NEXT: mul.rn.f32 %r5, %r2, %r2; +; CHECK-NEXT: mul.rn.f32 %r6, %r1, %r1; +; CHECK-NEXT: add.rn.f32 %r7, %r6, %r5; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NEXT: ret; %t1 = fmul <4 x float> %a, %a %t2 = extractelement <4 x float> %t1, i32 0 %t3 = extractelement <4 x float> %t1, i32 1 @@ -26,10 +43,18 @@ define float @bar(<4 x float> %a) { define <4 x float> @baz(<4 x float> %a) { -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) baz -; CHECK: .param .align 16 .b8 baz_param_0[16] -; CHECK: ld.param.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} -; CHECK: st.param.v4.b32 [func_retval0], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK-LABEL: baz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [baz_param_0]; +; CHECK-NEXT: mul.rn.f32 %r5, %r4, %r4; +; CHECK-NEXT: mul.rn.f32 %r6, %r3, %r3; +; CHECK-NEXT: mul.rn.f32 %r7, %r2, %r2; +; CHECK-NEXT: mul.rn.f32 %r8, %r1, %r1; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; %t1 = fmul <4 x float> %a, %a ret <4 x float> %t1 } diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll index 1ae6f6bcd748f..e16fc74325416 100644 --- a/llvm/test/CodeGen/NVPTX/vector-loads.ll +++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll @@ -207,18 +207,18 @@ define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly 
align 16 %dst ; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_global_a16_param_0]; ; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_global_a16_param_1]; ; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r5, %rs8; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs7; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs6; -; CHECK-NEXT: cvt.f32.f16 %r8, %rs5; -; CHECK-NEXT: cvt.f32.f16 %r9, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r10, %rs3; -; CHECK-NEXT: cvt.f32.f16 %r11, %rs2; -; CHECK-NEXT: cvt.f32.f16 %r12, %rs1; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r5, %rs2; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; CHECK-NEXT: cvt.f32.f16 %r9, %rs6; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs5; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.f32.f16 %r11, %rs8; +; CHECK-NEXT: cvt.f32.f16 %r12, %rs7; ; CHECK-NEXT: st.global.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9}; ; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; @@ -271,18 +271,18 @@ define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalia ; CHECK-NEXT: ld.param.b64 %rd1, [extv8f16_generic_a16_param_0]; ; CHECK-NEXT: ld.param.b64 %rd2, [extv8f16_generic_a16_param_1]; ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; -; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r4; -; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; CHECK-NEXT: cvt.f32.f16 %r5, %rs8; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs7; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs6; -; CHECK-NEXT: cvt.f32.f16 %r8, %rs5; -; CHECK-NEXT: cvt.f32.f16 %r9, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r10, %rs3; -; CHECK-NEXT: 
cvt.f32.f16 %r11, %rs2; -; CHECK-NEXT: cvt.f32.f16 %r12, %rs1; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r5, %rs2; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs4; +; CHECK-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r4; +; CHECK-NEXT: cvt.f32.f16 %r9, %rs6; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs5; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.f32.f16 %r11, %rs8; +; CHECK-NEXT: cvt.f32.f16 %r12, %rs7; ; CHECK-NEXT: st.v4.b32 [%rd1+16], {%r12, %r11, %r10, %r9}; ; CHECK-NEXT: st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll index f3b1015070085..d07c740d32a72 100644 --- a/llvm/test/CodeGen/NVPTX/vector-stores.ll +++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll @@ -1,38 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; CHECK-LABEL: .visible .func foo1 -; CHECK: st.v2.b32 define void @foo1(<2 x float> %val, ptr %ptr) { +; CHECK-LABEL: foo1( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [foo1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [foo1_param_1]; +; CHECK-NEXT: st.b64 [%rd2], %rd1; +; CHECK-NEXT: ret; store <2 x float> %val, ptr %ptr ret void } -; CHECK-LABEL: .visible .func foo2 -; CHECK: st.v4.b32 define void @foo2(<4 x float> %val, ptr %ptr) { +; CHECK-LABEL: foo2( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [foo2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd3, [foo2_param_1]; +; CHECK-NEXT: st.v2.b64 [%rd3], {%rd1, %rd2}; +; CHECK-NEXT: ret; store <4 x float> %val, ptr %ptr ret 
void } -; CHECK-LABEL: .visible .func foo3 -; CHECK: st.v2.b32 define void @foo3(<2 x i32> %val, ptr %ptr) { +; CHECK-LABEL: foo3( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [foo3_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo3_param_1]; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2}; +; CHECK-NEXT: ret; store <2 x i32> %val, ptr %ptr ret void } -; CHECK-LABEL: .visible .func foo4 -; CHECK: st.v4.b32 define void @foo4(<4 x i32> %val, ptr %ptr) { +; CHECK-LABEL: foo4( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [foo4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo4_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; store <4 x i32> %val, ptr %ptr ret void } -; CHECK-LABEL: .visible .func v16i8 define void @v16i8(ptr %a, ptr %b) { -; CHECK: ld.v4.b32 -; CHECK: st.v4.b32 +; CHECK-LABEL: v16i8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [v16i8_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [v16i8_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; %v = load <16 x i8>, ptr %a store <16 x i8> %v, ptr %b ret void