diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index f5f4d71236fee..a48dd0e5fedba 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3356,6 +3356,32 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(Op);
     break;
   }
+  case ISD::FCANONICALIZE: {
+    // This implements llvm.canonicalize.f* by multiplication with 1.0, as
+    // suggested in
+    // https://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic.
+    // It uses strict_fp operations even outside a strict_fp context in order
+    // to guarantee that the canonicalization is not optimized away by later
+    // passes. The result chain introduced by that is intentionally ignored
+    // since no ordering requirement is intended here.
+
+    // Create strict multiplication by 1.0.
+    SDValue Operand = Node->getOperand(0);
+    EVT VT = Operand.getValueType();
+    SDValue One = DAG.getConstantFP(1.0, dl, VT);
+    SDValue Chain = DAG.getEntryNode();
+    SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
+                              {Chain, Operand, One});
+
+    // Propagate existing flags on canonicalize, and additionally set
+    // NoFPExcept.
+    SDNodeFlags CanonicalizeFlags = Node->getFlags();
+    CanonicalizeFlags.setNoFPExcept(true);
+    Mul->setFlags(CanonicalizeFlags);
+
+    Results.push_back(Mul);
+    break;
+  }
   case ISD::SIGN_EXTEND_INREG: {
     EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
     EVT VT = Node->getValueType(0);
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 4f722005f4056..c9f927ade4f1f 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -875,6 +875,10 @@ void TargetLoweringBase::initActions() {
                       ISD::FATAN2},
                      {MVT::f32, MVT::f64, MVT::f128}, Expand);
 
+  // Expand llvm.canonicalize.* by default; targets can mark it Legal or
+  // Custom to override.
+  setOperationAction(ISD::FCANONICALIZE,
+                     {MVT::f16, MVT::f32, MVT::f64, MVT::f128}, Expand);
+
   // FIXME: Query RuntimeLibCalls to make the decision.
   setOperationAction({ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND},
                      {MVT::f32, MVT::f64, MVT::f128}, LibCall);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0d388fc3c787d..cdb68684b3856 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -768,6 +768,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(Op, MVT::v8bf16, Expand);
   }
 
+  // Legalize fcanonicalize to circumvent default expansion
+  setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
+  if (Subtarget->hasFullFP16()) {
+    setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal);
+  }
+
   // fpextend from f16 or bf16 to f32 is legal
   setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
   setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e32accaba85fc..26be37046e2e3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -423,6 +423,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10},
                      MVT::f16, Custom);
 
+  setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
+  if (Subtarget->has16BitInsts()) {
+    setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal);
+  }
+
   // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
   // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
   // default unless marked custom/legal.
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 602ce240814e9..72dbb44815657 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -195,6 +195,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
     setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+    setOperationAction(ISD::FCANONICALIZE, MVT::f32, Legal);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
     setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Legal);
     setOperationAction(ISD::IS_FPCLASS, MVT::f32, Legal);
@@ -242,6 +243,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
     setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
+    setOperationAction(ISD::FCANONICALIZE, MVT::f64, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
     setOperationAction(ISD::IS_FPCLASS, MVT::f64, Legal);
     setOperationAction(ISD::FSIN, MVT::f64, Expand);
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 5a15b7d9849d3..74a256e9729b3 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -373,6 +373,8 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
     setOperationAction(ISD::IS_FPCLASS, MVT::f32, Legal);
     setOperationAction(ISD::IS_FPCLASS, MVT::f64, Legal);
+    setOperationAction(ISD::FCANONICALIZE, MVT::f32, Legal);
+    setOperationAction(ISD::FCANONICALIZE, MVT::f64, Legal);
   } else {
     setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 3616b86a7fd3f..b96505816dee8 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -777,6 +777,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
     setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
     setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
+    setOperationAction(ISD::FCANONICALIZE, MVT::f64, Legal);
+    setOperationAction(ISD::FCANONICALIZE, MVT::f32, Legal);
   }
 
   if (Subtarget.hasAltivec()) {
diff --git a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
new file mode 100644
index 0000000000000..e02f931c4d31e
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
@@ -0,0 +1,405 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march s390x-unknown-linux-gnu --version 5
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=z16 < %s | FileCheck %s -check-prefixes=Z16
+
+define half @canonicalize_fp16(half %a) nounwind {
+; Z16-LABEL: canonicalize_fp16:
+; Z16:       # %bb.0:
+; Z16-NEXT:    stmg %r14, %r15, 112(%r15)
+; Z16-NEXT:    aghi %r15, -160
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    lmg %r14, %r15, 272(%r15)
+; Z16-NEXT:    br %r14
+  %canonicalized = call half @llvm.canonicalize.f16(half %a)
+  ret half %canonicalized
+}
+define float @canonicalize_fp32(float %a) {
+; Z16-LABEL: canonicalize_fp32:
+; Z16:       # %bb.0:
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    br %r14
+  %canonicalized = call float @llvm.canonicalize.f32(float %a)
+  ret float %canonicalized
+}
+
+define double @canonicalize_fp64(double %a) {
+; Z16-LABEL: canonicalize_fp64:
+; Z16:       # %bb.0:
+; Z16-NEXT:    vgmg %v1, 2, 11
+; Z16-NEXT:    mdbr %f0, %f1
+; Z16-NEXT:    br %r14
+  %canonicalized = call double @llvm.canonicalize.f64(double %a)
+  ret double %canonicalized
+}
+
+define fp128 @canonicalize_fp128(fp128 %a) {
+; Z16-LABEL: canonicalize_fp128:
+; Z16:       # %bb.0:
+; Z16-NEXT:    larl %r1, .LCPI3_0
+; Z16-NEXT:    vl %v0, 0(%r3), 3
+; Z16-NEXT:    vl %v1, 0(%r1), 3
+; Z16-NEXT:    wfmxb %v0, %v0, %v1
+; Z16-NEXT:    vst %v0, 0(%r2), 3
+; Z16-NEXT:    br %r14
+  %canonicalized = call fp128 @llvm.canonicalize.f128(fp128 %a)
+  ret fp128 %canonicalized
+}
+
+define void @canonicalize_ptr_f16(ptr %out) nounwind {
+; Z16-LABEL: canonicalize_ptr_f16:
+; Z16:       # %bb.0:
+; Z16-NEXT:    stmg %r13, %r15, 104(%r15)
+; Z16-NEXT:    aghi %r15, -160
+; Z16-NEXT:    vlreph %v0, 0(%r2)
+; Z16-NEXT:    lgr %r13, %r2
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    vsteh %v0, 0(%r13), 0
+; Z16-NEXT:    lmg %r13, %r15, 264(%r15)
+; Z16-NEXT:    br %r14
+  %val = load half, ptr %out
+  %canonicalized = call half @llvm.canonicalize.f16(half %val)
+  store half %canonicalized, ptr %out
+  ret void
+}
+
+define void @canonicalize_ptr_f32(ptr %out) {
+; Z16-LABEL: canonicalize_ptr_f32:
+; Z16:       # %bb.0:
+; Z16-NEXT:    vgmf %v0, 2, 8
+; Z16-NEXT:    meeb %f0, 0(%r2)
+; Z16-NEXT:    ste %f0, 0(%r2)
+; Z16-NEXT:    br %r14
+  %val = load float, ptr %out
+  %canonicalized = call float @llvm.canonicalize.f32(float %val)
+  store float %canonicalized, ptr %out
+  ret void
+}
+
+define void @canonicalize_ptr_f64(ptr %out) {
+; Z16-LABEL: canonicalize_ptr_f64:
+; Z16:       # %bb.0:
+; Z16-NEXT:    vgmg %v0, 2, 11
+; Z16-NEXT:    mdb %f0, 0(%r2)
+; Z16-NEXT:    std %f0, 0(%r2)
+; Z16-NEXT:    br %r14
+  %val = load double, ptr %out
+  %canonicalized = call double @llvm.canonicalize.f64(double %val)
+  store double %canonicalized, ptr %out
+  ret void
+}
+
+define void @canonicalize_ptr_f128(ptr %out) {
+; Z16-LABEL: canonicalize_ptr_f128:
+; Z16:       # %bb.0:
+; Z16-NEXT:    larl %r1, .LCPI7_0
+; Z16-NEXT:    vl %v0, 0(%r2), 3
+; Z16-NEXT:    vl %v1, 0(%r1), 3
+; Z16-NEXT:    wfmxb %v0, %v0, %v1
+; Z16-NEXT:    vst %v0, 0(%r2), 3
+; Z16-NEXT:    br %r14
+  %val = load fp128, ptr %out
+  %canonicalized = call fp128 @llvm.canonicalize.f128(fp128 %val)
+  store fp128 %canonicalized, ptr %out
+  ret void
+}
+define <8 x half> @canonicalize_v8f16(<8 x half> %a) nounwind {
+; Z16-LABEL: canonicalize_v8f16:
+; Z16:       # %bb.0:
+; Z16-NEXT:    stmg %r13, %r15, 104(%r15)
+; Z16-NEXT:    aghi %r15, -224
+; Z16-NEXT:    std %f8, 216(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f9, 208(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f10, 200(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f11, 192(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f12, 184(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f13, 176(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f14, 168(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f15, 160(%r15) # 8-byte Spill
+; Z16-NEXT:    vlreph %v11, 414(%r15)
+; Z16-NEXT:    vlreph %v12, 406(%r15)
+; Z16-NEXT:    vlreph %v13, 398(%r15)
+; Z16-NEXT:    vlreph %v14, 390(%r15)
+; Z16-NEXT:    ldr %f8, %f6
+; Z16-NEXT:    ldr %f9, %f4
+; Z16-NEXT:    ldr %f10, %f2
+; Z16-NEXT:    lgr %r13, %r2
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f15, %f0
+; Z16-NEXT:    ldr %f0, %f10
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f10, %f0
+; Z16-NEXT:    ldr %f0, %f9
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f9, %f0
+; Z16-NEXT:    ldr %f0, %f8
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f8, %f0
+; Z16-NEXT:    ldr %f0, %f14
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f14, %f0
+; Z16-NEXT:    ldr %f0, %f13
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f13, %f0
+; Z16-NEXT:    ldr %f0, %f12
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f12, %f0
+; Z16-NEXT:    ldr %f0, %f11
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    vsteh %v0, 14(%r13), 0
+; Z16-NEXT:    vsteh %v12, 12(%r13), 0
+; Z16-NEXT:    vsteh %v13, 10(%r13), 0
+; Z16-NEXT:    vsteh %v14, 8(%r13), 0
+; Z16-NEXT:    vsteh %v8, 6(%r13), 0
+; Z16-NEXT:    vsteh %v9, 4(%r13), 0
+; Z16-NEXT:    vsteh %v10, 2(%r13), 0
+; Z16-NEXT:    vsteh %v15, 0(%r13), 0
+; Z16-NEXT:    ld %f8, 216(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f9, 208(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f10, 200(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f11, 192(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f12, 184(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f13, 176(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f14, 168(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f15, 160(%r15) # 8-byte Reload
+; Z16-NEXT:    lmg %r13, %r15, 328(%r15)
+; Z16-NEXT:    br %r14
+  %canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %a)
+  ret <8 x half> %canonicalized
+}
+define <4 x float> @canonicalize_v4f32(<4 x float> %a) {
+; Z16-LABEL: canonicalize_v4f32:
+; Z16:       # %bb.0:
+; Z16-NEXT:    vrepf %v0, %v24, 3
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    vrepf %v2, %v24, 2
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    meebr %f2, %f1
+; Z16-NEXT:    vrepf %v3, %v24, 1
+; Z16-NEXT:    vmrhf %v0, %v2, %v0
+; Z16-NEXT:    wfmsb %f2, %v24, %f1
+; Z16-NEXT:    wfmsb %f1, %f3, %f1
+; Z16-NEXT:    vmrhf %v1, %v2, %v1
+; Z16-NEXT:    vmrhg %v24, %v1, %v0
+; Z16-NEXT:    br %r14
+  %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %a)
+  ret <4 x float> %canonicalized
+}
+
+define <4 x double> @canonicalize_v4f64(<4 x double> %a) {
+; Z16-LABEL: canonicalize_v4f64:
+; Z16:       # %bb.0:
+; Z16-NEXT:    vgmg %v0, 2, 11
+; Z16-NEXT:    vrepg %v2, %v24, 1
+; Z16-NEXT:    wfmdb %f1, %v24, %f0
+; Z16-NEXT:    mdbr %f2, %f0
+; Z16-NEXT:    vmrhg %v24, %v1, %v2
+; Z16-NEXT:    vrepg %v2, %v26, 1
+; Z16-NEXT:    wfmdb %f1, %v26, %f0
+; Z16-NEXT:    wfmdb %f0, %f2, %f0
+; Z16-NEXT:    vmrhg %v26, %v1, %v0
+; Z16-NEXT:    br %r14
+  %canonicalized = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %a)
+  ret <4 x double> %canonicalized
+}
+
+define <2 x fp128> @canonicalize_v2f128(<2 x fp128> %a) {
+; Z16-LABEL: canonicalize_v2f128:
+; Z16:       # %bb.0:
+; Z16-NEXT:    larl %r1, .LCPI11_0
+; Z16-NEXT:    vl %v0, 16(%r3), 3
+; Z16-NEXT:    vl %v1, 0(%r3), 3
+; Z16-NEXT:    vl %v2, 0(%r1), 3
+; Z16-NEXT:    wfmxb %v1, %v1, %v2
+; Z16-NEXT:    wfmxb %v0, %v0, %v2
+; Z16-NEXT:    vst %v0, 16(%r2), 4
+; Z16-NEXT:    vst %v1, 0(%r2), 4
+; Z16-NEXT:    br %r14
+  %canonicalized = call <2 x fp128> @llvm.canonicalize.v2f128(<2 x fp128> %a)
+  ret <2 x fp128> %canonicalized
+}
+
+define void @canonicalize_ptr_v8f16(ptr %out) nounwind {
+; Z16-LABEL: canonicalize_ptr_v8f16:
+; Z16:       # %bb.0:
+; Z16-NEXT:    stmg %r13, %r15, 104(%r15)
+; Z16-NEXT:    aghi %r15, -224
+; Z16-NEXT:    std %f8, 216(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f9, 208(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f10, 200(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f11, 192(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f12, 184(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f13, 176(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f14, 168(%r15) # 8-byte Spill
+; Z16-NEXT:    std %f15, 160(%r15) # 8-byte Spill
+; Z16-NEXT:    vlreph %v0, 0(%r2)
+; Z16-NEXT:    vlreph %v8, 14(%r2)
+; Z16-NEXT:    vlreph %v9, 12(%r2)
+; Z16-NEXT:    vlreph %v10, 10(%r2)
+; Z16-NEXT:    lgr %r13, %r2
+; Z16-NEXT:    vlreph %v11, 8(%r2)
+; Z16-NEXT:    vlreph %v12, 6(%r2)
+; Z16-NEXT:    vlreph %v13, 4(%r2)
+; Z16-NEXT:    vlreph %v14, 2(%r2)
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f15, %f0
+; Z16-NEXT:    ldr %f0, %f14
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f14, %f0
+; Z16-NEXT:    ldr %f0, %f13
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f13, %f0
+; Z16-NEXT:    ldr %f0, %f12
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f12, %f0
+; Z16-NEXT:    ldr %f0, %f11
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f11, %f0
+; Z16-NEXT:    ldr %f0, %f10
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f10, %f0
+; Z16-NEXT:    ldr %f0, %f9
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    ldr %f9, %f0
+; Z16-NEXT:    ldr %f0, %f8
+; Z16-NEXT:    brasl %r14, __extendhfsf2@PLT
+; Z16-NEXT:    vgmf %v1, 2, 8
+; Z16-NEXT:    meebr %f0, %f1
+; Z16-NEXT:    brasl %r14, __truncsfhf2@PLT
+; Z16-NEXT:    vsteh %v9, 12(%r13), 0
+; Z16-NEXT:    vsteh %v10, 10(%r13), 0
+; Z16-NEXT:    vsteh %v11, 8(%r13), 0
+; Z16-NEXT:    vsteh %v12, 6(%r13), 0
+; Z16-NEXT:    vsteh %v13, 4(%r13), 0
+; Z16-NEXT:    vsteh %v14, 2(%r13), 0
+; Z16-NEXT:    vsteh %v15, 0(%r13), 0
+; Z16-NEXT:    ld %f8, 216(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f9, 208(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f10, 200(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f11, 192(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f12, 184(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f13, 176(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f14, 168(%r15) # 8-byte Reload
+; Z16-NEXT:    ld %f15, 160(%r15) # 8-byte Reload
+; Z16-NEXT:    vsteh %v0, 14(%r13), 0
+; Z16-NEXT:    lmg %r13, %r15, 328(%r15)
+; Z16-NEXT:    br %r14
+  %val = load <8 x half>, ptr %out
+  %canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %val)
+  store <8 x half> %canonicalized, ptr %out
+  ret void
+}
+
+define void @canonicalize_ptr_v4f32(ptr %out) {
+; Z16-LABEL: canonicalize_ptr_v4f32:
+; Z16:       # %bb.0:
+; Z16-NEXT:    vl %v0, 0(%r2), 3
+; Z16-NEXT:    vrepf %v1, %v0, 3
+; Z16-NEXT:    vgmf %v2, 2, 8
+; Z16-NEXT:    vrepf %v3, %v0, 2
+; Z16-NEXT:    meebr %f1, %f2
+; Z16-NEXT:    meebr %f3, %f2
+; Z16-NEXT:    vmrhf %v1, %v3, %v1
+; Z16-NEXT:    wfmsb %f3, %f0, %f2
+; Z16-NEXT:    vrepf %v0, %v0, 1
+; Z16-NEXT:    meebr %f0, %f2
+; Z16-NEXT:    vmrhf %v0, %v3, %v0
+; Z16-NEXT:    vmrhg %v0, %v0, %v1
+; Z16-NEXT:    vst %v0, 0(%r2), 3
+; Z16-NEXT:    br %r14
+  %val = load <4 x float>, ptr %out
+  %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %val)
+  store <4 x float> %canonicalized, ptr %out
+  ret void
+}
+
+define void @canonicalize_ptr_v4f64(ptr %out) {
+; Z16-LABEL: canonicalize_ptr_v4f64:
+; Z16:       # %bb.0:
+; Z16-NEXT:    vl %v1, 16(%r2), 4
+; Z16-NEXT:    vgmg %v2, 2, 11
+; Z16-NEXT:    wfmdb %f3, %f1, %f2
+; Z16-NEXT:    vrepg %v1, %v1, 1
+; Z16-NEXT:    mdbr %f1, %f2
+; Z16-NEXT:    vl %v0, 0(%r2), 4
+; Z16-NEXT:    vmrhg %v1, %v3, %v1
+; Z16-NEXT:    wfmdb %f3, %f0, %f2
+; Z16-NEXT:    vrepg %v0, %v0, 1
+; Z16-NEXT:    mdbr %f0, %f2
+; Z16-NEXT:    vmrhg %v0, %v3, %v0
+; Z16-NEXT:    vst %v0, 0(%r2), 4
+; Z16-NEXT:    vst %v1, 16(%r2), 4
+; Z16-NEXT:    br %r14
+  %val = load <4 x double>, ptr %out
+  %canonicalized = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %val)
+  store <4 x double> %canonicalized, ptr %out
+  ret void
+}
+
+define void @canonicalize_ptr_v2f128(ptr %out) {
+; Z16-LABEL: canonicalize_ptr_v2f128:
+; Z16:       # %bb.0:
+; Z16-NEXT:    larl %r1, .LCPI15_0
+; Z16-NEXT:    vl %v0, 16(%r2), 4
+; Z16-NEXT:    vl %v1, 0(%r2), 4
+; Z16-NEXT:    vl %v2, 0(%r1), 3
+; Z16-NEXT:    wfmxb %v1, %v1, %v2
+; Z16-NEXT:    wfmxb %v0, %v0, %v2
+; Z16-NEXT:    vst %v0, 16(%r2), 4
+; Z16-NEXT:    vst %v1, 0(%r2), 4
+; Z16-NEXT:    br %r14
+  %val = load <2 x fp128>, ptr %out
+  %canonicalized = call <2 x fp128> @llvm.canonicalize.v2f128(<2 x fp128> %val)
+  store <2 x fp128> %canonicalized, ptr %out
+  ret void
+}
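
Note (a sketch, not part of the patch): the STRICT_FMUL node built by the new ExpandNode case corresponds, at the IR level, to a constrained multiply by 1.0 with FP exceptions ignored; setting NoFPExcept and dropping the result chain encode exactly that. Assuming the existing llvm.experimental.constrained.fmul intrinsic and an illustrative function name, the default expansion is roughly equivalent to:

; Sketch of what the default FCANONICALIZE expansion computes.
; The constrained semantics keep the multiply from being folded to %x,
; mirroring the patch's use of strict_fp nodes outside a strict_fp context;
; "fpexcept.ignore" mirrors the NoFPExcept flag set on the STRICT_FMUL node.
define double @canonicalize_sketch(double %x) strictfp {
  %r = call double @llvm.experimental.constrained.fmul.f64(double %x, double 1.0, metadata !"round.dynamic", metadata !"fpexcept.ignore") strictfp
  ret double %r
}
declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)

The SystemZ checks above reflect this directly: vgmf/vgmg materialize the 1.0 constant as a generated mask (bits 2-8 of an f32 and bits 2-11 of an f64 give 0x3F800000 and 0x3FF0000000000000), and meebr/mdbr/wfmxb perform the multiply that forces canonicalization.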