diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index daf898ff37217f..fc07b06796466f 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3132,6 +3132,7 @@ class Compiler #if defined(TARGET_ARM64) GenTree* gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize); + GenTree* gtNewSimdFalseMaskByteNode(unsigned simdSize); #endif GenTree* gtNewSimdBinOpNode(genTreeOps op, @@ -6691,6 +6692,15 @@ class Compiler GenTree* fgMorphHWIntrinsic(GenTreeHWIntrinsic* tree); GenTree* fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node); GenTree* fgOptimizeHWIntrinsicAssociative(GenTreeHWIntrinsic* node); +#if defined(FEATURE_MASKED_HW_INTRINSICS) + GenTreeHWIntrinsic* fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* node); +#endif // FEATURE_MASKED_HW_INTRINSICS +#ifdef TARGET_ARM64 + bool canMorphVectorOperandToMask(GenTree* node); + bool canMorphAllVectorOperandsToMasks(GenTreeHWIntrinsic* node); + GenTree* doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent); + GenTreeHWIntrinsic* fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node); +#endif // TARGET_ARM64 #endif // FEATURE_HW_INTRINSICS GenTree* fgOptimizeCommutativeArithmetic(GenTreeOp* tree); GenTree* fgOptimizeRelationalComparisonWithCasts(GenTreeOp* cmp); diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index d936d579d8e25a..470ab85650bf7c 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -233,6 +233,11 @@ enum HWIntrinsicFlag : unsigned int // The intrinsic is a reduce operation. HW_Flag_ReduceOperation = 0x2000000, + // This intrinsic can be implemented with another intrinsic when it is operating on operands that are all of + // type TYP_MASK, and this other intrinsic will produce a value of that type. Used in morph to convert vector + // operations into mask operations when the intrinsic is operating on mask vectors (mainly bitwise operations). + HW_Flag_HasAllMaskVariant = 0x4000000, + #else #error Unsupported platform #endif @@ -1133,6 +1138,67 @@ struct HWIntrinsicInfo } } +#ifdef FEATURE_MASKED_HW_INTRINSICS + // HasAllMaskVariant: Does the intrinsic have a variant that operates on mask types? + // + // Arguments: + // id -- the intrinsic to check for a mask-type variant. + // + // Return Value: + // true when the intrinsic has a mask-type variant, else false + // + static bool HasAllMaskVariant(NamedIntrinsic id) + { + const HWIntrinsicFlag flags = lookupFlags(id); + return (flags & HW_Flag_HasAllMaskVariant) != 0; + } + + // GetMaskVariant: Given an intrinsic that has a variant that operates on mask types, return the ID of + // this variant intrinsic. Call HasAllMaskVariant before using this function, as it will + // assert if no match is found. + // + // Arguments: + // id -- the intrinsic with a mask-type variant.
+ // + // Return Value: + // The ID of the mask-type variant for the given intrinsic + // + static NamedIntrinsic GetMaskVariant(NamedIntrinsic id) + { + assert(HasAllMaskVariant(id)); + switch (id) + { + case NI_Sve_And: + return NI_Sve_And_Predicates; + case NI_Sve_BitwiseClear: + return NI_Sve_BitwiseClear_Predicates; + case NI_Sve_Xor: + return NI_Sve_Xor_Predicates; + case NI_Sve_Or: + return NI_Sve_Or_Predicates; + case NI_Sve_ZipHigh: + return NI_Sve_ZipHigh_Predicates; + case NI_Sve_ZipLow: + return NI_Sve_ZipLow_Predicates; + case NI_Sve_UnzipOdd: + return NI_Sve_UnzipOdd_Predicates; + case NI_Sve_UnzipEven: + return NI_Sve_UnzipEven_Predicates; + case NI_Sve_TransposeEven: + return NI_Sve_TransposeEven_Predicates; + case NI_Sve_TransposeOdd: + return NI_Sve_TransposeOdd_Predicates; + case NI_Sve_ReverseElement: + return NI_Sve_ReverseElement_Predicates; + case NI_Sve_ConditionalSelect: + return NI_Sve_ConditionalSelect_Predicates; + + default: + unreached(); + } + } +#endif // FEATURE_MASKED_HW_INTRINSICS + #endif // TARGET_ARM64 static bool HasSpecialSideEffect(NamedIntrinsic id) diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index af6c1bb935ae81..06b6b1edf77aa6 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -3341,7 +3341,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } //------------------------------------------------------------------------ -// gtNewSimdEmbeddedMaskNode: Create an embedded mask +// gtNewSimdAllTrueMaskNode: Create an embedded mask with all bits set to true // // Arguments: // simdBaseJitType -- the base jit type of the nodes being masked @@ -3355,4 +3355,18 @@ GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigne return gtNewSimdHWIntrinsicNode(TYP_MASK, NI_Sve_CreateTrueMaskAll, simdBaseJitType, simdSize); } +//------------------------------------------------------------------------ +// gtNewSimdFalseMaskByteNode: Create an embedded mask with all bits set to false +// +// Arguments: +// simdSize -- the simd size of the nodes being masked +// +// Return Value: +// The mask +// +GenTree* Compiler::gtNewSimdFalseMaskByteNode(unsigned simdSize) +{ + return gtNewSimdHWIntrinsicNode(TYP_MASK, NI_Sve_CreateFalseMaskByte, CORINFO_TYPE_UBYTE, simdSize); +} + #endif // FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index b03b492fa4045e..bdf319aeb5b460 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -702,6 +702,14 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_Sve_And_Predicates: + case NI_Sve_BitwiseClear_Predicates: + case NI_Sve_Or_Predicates: + case NI_Sve_Xor_Predicates: + GetEmitter()->emitIns_R_R_R_R(insEmbMask, emitSize, targetReg, maskReg, embMaskOp1Reg, + embMaskOp2Reg, INS_OPTS_SCALABLE_B); + break; + default: { GetEmitter()->emitIns_R_R_R_R(insEmbMask, emitSize, targetReg, maskReg, embMaskOp1Reg, @@ -2478,6 +2486,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Sve_CreateBreakAfterPropagateMask: case NI_Sve_CreateBreakBeforePropagateMask: + case NI_Sve_ConditionalSelect_Predicates: { GetEmitter()->emitInsSve_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg, INS_OPTS_SCALABLE_B); break; diff --git a/src/coreclr/jit/hwintrinsiclistarm64sve.h b/src/coreclr/jit/hwintrinsiclistarm64sve.h index 54f797376eca04..7038e725808909 100644 
--- a/src/coreclr/jit/hwintrinsiclistarm64sve.h +++ b/src/coreclr/jit/hwintrinsiclistarm64sve.h @@ -27,9 +27,9 @@ HARDWARE_INTRINSIC(Sve, AddAcross, HARDWARE_INTRINSIC(Sve, AddRotateComplex, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcadd, INS_sve_fcadd}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_HasImmediateOperand) HARDWARE_INTRINSIC(Sve, AddSaturate, -1, 2, {INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable) HARDWARE_INTRINSIC(Sve, AddSequentialAcross, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fadda, INS_sve_fadda}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_ReduceOperation) -HARDWARE_INTRINSIC(Sve, And, -1, -1, {INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation) +HARDWARE_INTRINSIC(Sve, And, -1, -1, {INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, AndAcross, -1, -1, {INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation) -HARDWARE_INTRINSIC(Sve, BitwiseClear, -1, -1, {INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation) +HARDWARE_INTRINSIC(Sve, BitwiseClear, -1, -1, {INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, BooleanNot, -1, -1, {INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, Compact, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_compact, INS_sve_compact, INS_sve_compact, INS_sve_compact, INS_sve_compact, INS_sve_compact}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, CompareEqual, -1, -1, {INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_fcmeq, INS_sve_fcmeq}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_ZeroingMaskedOperation) @@ -47,7 +47,7 @@ 
HARDWARE_INTRINSIC(Sve, ConditionalExtractAfterLastActiveElement, HARDWARE_INTRINSIC(Sve, ConditionalExtractAfterLastActiveElementAndReplicate, -1, 3, {INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, ConditionalExtractLastActiveElement, -1, 3, {INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_HasScalarInputVariant|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, ConditionalExtractLastActiveElementAndReplicate, -1, 3, {INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics) -HARDWARE_INTRINSIC(Sve, ConditionalSelect, -1, 3, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_SupportsContainment) +HARDWARE_INTRINSIC(Sve, ConditionalSelect, -1, 3, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_SupportsContainment|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, ConvertToDouble, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_scvtf, INS_sve_ucvtf, INS_sve_scvtf, INS_sve_ucvtf, INS_sve_fcvt, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ConvertToInt32, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcvtzs, INS_sve_fcvtzs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ConvertToInt64, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcvtzs, INS_sve_fcvtzs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) @@ -224,7 +224,7 @@ HARDWARE_INTRINSIC(Sve, MultiplyExtended, HARDWARE_INTRINSIC(Sve, MultiplySubtract, -1, -1, {INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_FmaIntrinsic|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, Negate, -1, -1, {INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_fneg, INS_sve_fneg}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, Not, -1, -1, {INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_invalid, 
INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation) -HARDWARE_INTRINSIC(Sve, Or, -1, -1, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation) +HARDWARE_INTRINSIC(Sve, Or, -1, -1, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, OrAcross, -1, -1, {INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation) HARDWARE_INTRINSIC(Sve, PopCount, -1, -1, {INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, PrefetchBytes, -1, 3, {INS_invalid, INS_sve_prfb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_SpecialSideEffect_Other) @@ -237,7 +237,7 @@ HARDWARE_INTRINSIC(Sve, ReciprocalSqrtEstimate, HARDWARE_INTRINSIC(Sve, ReciprocalSqrtStep, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_frsqrts, INS_sve_frsqrts}, HW_Category_SIMD, HW_Flag_Scalable) HARDWARE_INTRINSIC(Sve, ReciprocalStep, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_frecps, INS_sve_frecps}, HW_Category_SIMD, HW_Flag_Scalable) HARDWARE_INTRINSIC(Sve, ReverseBits, -1, -1, {INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) -HARDWARE_INTRINSIC(Sve, ReverseElement, -1, 1, {INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, ReverseElement, -1, 1, {INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, ReverseElement16, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_revh, INS_sve_revh, INS_sve_revh, INS_sve_revh, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ReverseElement32, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_revw, INS_sve_revw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) 
HARDWARE_INTRINSIC(Sve, ReverseElement8, -1, -1, {INS_invalid, INS_invalid, INS_sve_revb, INS_sve_revb, INS_sve_revb, INS_sve_revb, INS_sve_revb, INS_sve_revb, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) @@ -284,23 +284,23 @@ HARDWARE_INTRINSIC(Sve, SubtractSaturate, HARDWARE_INTRINSIC(Sve, TestAnyTrue, -1, 2, {INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, TestFirstTrue, -1, 2, {INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, TestLastTrue, -1, 2, {INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(Sve, TransposeEven, -1, 2, {INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(Sve, TransposeOdd, -1, 2, {INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, TransposeEven, -1, 2, {INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant) +HARDWARE_INTRINSIC(Sve, TransposeOdd, -1, 2, {INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, TrigonometricMultiplyAddCoefficient, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ftmad, INS_sve_ftmad}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, TrigonometricSelectCoefficient, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ftssel, INS_sve_ftssel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Sve, TrigonometricStartingValue, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ftsmul, INS_sve_ftsmul}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Sve, UnzipEven, -1, 2, {INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(Sve, UnzipOdd, -1, 2, {INS_sve_uzp2, 
INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, UnzipEven, -1, 2, {INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant) +HARDWARE_INTRINSIC(Sve, UnzipOdd, -1, 2, {INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, VectorTableLookup, -1, 2, {INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl}, HW_Category_SIMD, HW_Flag_Scalable) -HARDWARE_INTRINSIC(Sve, Xor, -1, -1, {INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation) +HARDWARE_INTRINSIC(Sve, Xor, -1, -1, {INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, XorAcross, -1, -1, {INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation) HARDWARE_INTRINSIC(Sve, ZeroExtend16, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_uxth, INS_invalid, INS_sve_uxth, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ZeroExtend32, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_uxtw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ZeroExtend8, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_sve_uxtb, INS_invalid, INS_sve_uxtb, INS_invalid, INS_sve_uxtb, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ZeroExtendWideningLower, -1, 1, {INS_invalid, INS_sve_uunpklo, INS_invalid, INS_sve_uunpklo, INS_invalid, INS_sve_uunpklo, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Sve, ZeroExtendWideningUpper, -1, 1, {INS_invalid, INS_sve_uunpkhi, INS_invalid, INS_sve_uunpkhi, INS_invalid, INS_sve_uunpkhi, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg) -HARDWARE_INTRINSIC(Sve, ZipHigh, -1, 2, {INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(Sve, ZipLow, -1, 2, {INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, 
INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, ZipHigh, -1, 2, {INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant) +HARDWARE_INTRINSIC(Sve, ZipLow, -1, 2, {INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant) #define LAST_NI_Sve NI_Sve_ZipLow // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** @@ -333,6 +333,19 @@ HARDWARE_INTRINSIC(Sve, SaturatingIncrementBy64BitElementCountScalar, HARDWARE_INTRINSIC(Sve, StoreAndZipx2, -1, 3, {INS_sve_st2b, INS_sve_st2b, INS_sve_st2h, INS_sve_st2h, INS_sve_st2w, INS_sve_st2w, INS_sve_st2d, INS_sve_st2d, INS_sve_st2w, INS_sve_st2d}, HW_Category_MemoryStore, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters) HARDWARE_INTRINSIC(Sve, StoreAndZipx3, -1, 3, {INS_sve_st3b, INS_sve_st3b, INS_sve_st3h, INS_sve_st3h, INS_sve_st3w, INS_sve_st3w, INS_sve_st3d, INS_sve_st3d, INS_sve_st3w, INS_sve_st3d}, HW_Category_MemoryStore, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters) HARDWARE_INTRINSIC(Sve, StoreAndZipx4, -1, 3, {INS_sve_st4b, INS_sve_st4b, INS_sve_st4h, INS_sve_st4h, INS_sve_st4w, INS_sve_st4w, INS_sve_st4d, INS_sve_st4d, INS_sve_st4w, INS_sve_st4d}, HW_Category_MemoryStore, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters) +// Predicate variants of intrinsics, these are specialized for operating on TYP_MASK type values. 
+HARDWARE_INTRINSIC(Sve, And_Predicates, -1, 2, {INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, BitwiseClear_Predicates, -1, 2, {INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, Or_Predicates, -1, 2, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, Xor_Predicates, -1, 2, {INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, ConditionalSelect_Predicates, -1, 3, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_ExplicitMaskedOperation|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, ZipHigh_Predicates, -1, 2, {INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, ZipLow_Predicates, -1, 2, {INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, UnzipEven_Predicates, -1, 2, {INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, UnzipOdd_Predicates, -1, 2, {INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, TransposeEven_Predicates, -1, 2, {INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, TransposeOdd_Predicates, -1, 2, {INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Sve, ReverseElement_Predicates, -1, 1, {INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) #endif // FEATURE_HW_INTRINSIC diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index b52c8e48cbe0e4..37861d2284a2e5 100644 --- a/src/coreclr/jit/morph.cpp +++ 
b/src/coreclr/jit/morph.cpp @@ -9397,151 +9397,11 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) default: { #if defined(FEATURE_MASKED_HW_INTRINSICS) - bool isScalar = false; - genTreeOps actualOper = node->GetOperForHWIntrinsicId(&isScalar); - genTreeOps oper = actualOper; - - // We shouldn't find AND_NOT, OR_NOT or XOR_NOT nodes since it should only be produced in lowering - assert((oper != GT_AND_NOT) && (oper != GT_OR_NOT) && (oper != GT_XOR_NOT)); - - if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)) + GenTreeHWIntrinsic* maskedIntrinsic = fgOptimizeForMaskedIntrinsic(node); + if (maskedIntrinsic != nullptr) { - GenTree* op1 = node->Op(1); - - GenTree* op2; - GenTree* actualOp2; - - if (oper == GT_NOT) - { - op2 = op1; - actualOp2 = nullptr; - } - else - { - op2 = node->Op(2); - actualOp2 = op2; - } - - // We need both operands to be ConvertMaskToVector in - // order to optimize this to a direct mask operation - - if (!op1->OperIsConvertMaskToVector()) - { - break; - } - - if (!op2->OperIsHWIntrinsic()) - { -#if defined(TARGET_XARCH) - if ((oper != GT_XOR) || !op2->IsVectorAllBitsSet()) - { - break; - } - - // We want to explicitly recognize op1 ^ AllBitsSet as - // some platforms don't have direct support for ~op1 - - oper = GT_NOT; - op2 = op1; -#else - break; -#endif - } - - GenTreeHWIntrinsic* cvtOp1 = op1->AsHWIntrinsic(); - GenTreeHWIntrinsic* cvtOp2 = op2->AsHWIntrinsic(); - - if (!cvtOp2->OperIsConvertMaskToVector()) - { - break; - } - - unsigned simdBaseTypeSize = genTypeSize(simdBaseType); - - if ((genTypeSize(cvtOp1->GetSimdBaseType()) != simdBaseTypeSize) || - (genTypeSize(cvtOp2->GetSimdBaseType()) != simdBaseTypeSize)) - { - // We need both operands to be the same kind of mask; otherwise - // the bitwise operation can differ in how it performs - break; - } - - NamedIntrinsic maskIntrinsicId = NI_Illegal; - -#if defined(TARGET_XARCH) - switch (oper) - { - case GT_AND: - { - maskIntrinsicId = NI_EVEX_AndMask; - break; - } - - case GT_NOT: - { - maskIntrinsicId = NI_EVEX_NotMask; - break; - } - - case GT_OR: - { - maskIntrinsicId = NI_EVEX_OrMask; - break; - } - - case GT_XOR: - { - maskIntrinsicId = NI_EVEX_XorMask; - break; - } - - default: - { - unreached(); - } - } -#elif defined(TARGET_ARM64) - // TODO-ARM64-CQ: Support transforming bitwise operations on masks - break; -#else -#error Unsupported platform -#endif // !TARGET_XARCH && !TARGET_ARM64 - - if (maskIntrinsicId == NI_Illegal) - { - break; - } - - if (oper == actualOper) - { - node->ChangeHWIntrinsicId(maskIntrinsicId); - node->Op(1) = cvtOp1->Op(1); - } - else - { - assert(oper == GT_NOT); - node->ResetHWIntrinsicId(maskIntrinsicId, this, cvtOp1->Op(1)); - node->gtFlags &= ~GTF_REVERSE_OPS; - } - - node->gtType = TYP_MASK; - DEBUG_DESTROY_NODE(op1); - - if (oper != GT_NOT) - { - assert(actualOp2 != nullptr); - node->Op(2) = cvtOp2->Op(1); - } - - if (actualOp2 != nullptr) - { - DEBUG_DESTROY_NODE(actualOp2); - } - + node = maskedIntrinsic; node->SetMorphed(this); - node = gtNewSimdCvtMaskToVectorNode(retType, node, simdBaseJitType, simdSize)->AsHWIntrinsic(); - node->SetMorphed(this); - return node; } #endif // FEATURE_MASKED_HW_INTRINSICS break; @@ -9726,6 +9586,284 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) return node; } +#if defined(FEATURE_MASKED_HW_INTRINSICS) +//------------------------------------------------------------------------ +// fgOptimizeForMaskedIntrinsic: Tries to recognize intrinsics that are operating +// on mask types and morphs the 
tree to use intrinsics +// better suited to this. +// +// Arguments: +// node - the hardware intrinsic tree to try and optimize. +// This tree will be mutated if it is possible to optimize the tree. +// +// Return Value: +// The optimized tree, nullptr if no change was made. +// +GenTreeHWIntrinsic* Compiler::fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* node) +{ +#if defined(TARGET_XARCH) + bool isScalar = false; + genTreeOps actualOper = node->GetOperForHWIntrinsicId(&isScalar); + genTreeOps oper = actualOper; + var_types retType = node->TypeGet(); + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + + // We shouldn't find AND_NOT, OR_NOT or XOR_NOT nodes since it should only be produced in lowering + assert((oper != GT_AND_NOT) && (oper != GT_OR_NOT) && (oper != GT_XOR_NOT)); + + if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper)) + { + GenTree* op1 = node->Op(1); + + GenTree* op2; + GenTree* actualOp2; + + if (oper == GT_NOT) + { + op2 = op1; + actualOp2 = nullptr; + } + else + { + op2 = node->Op(2); + actualOp2 = op2; + } + + // We need both operands to be ConvertMaskToVector in + // order to optimize this to a direct mask operation + + if (!op1->OperIsConvertMaskToVector()) + { + return nullptr; + } + + if (!op2->OperIsHWIntrinsic()) + { + if ((oper != GT_XOR) || !op2->IsVectorAllBitsSet()) + { + return nullptr; + } + + // We want to explicitly recognize op1 ^ AllBitsSet as + // some platforms don't have direct support for ~op1 + + oper = GT_NOT; + op2 = op1; + } + + GenTreeHWIntrinsic* cvtOp1 = op1->AsHWIntrinsic(); + GenTreeHWIntrinsic* cvtOp2 = op2->AsHWIntrinsic(); + + if (!cvtOp2->OperIsConvertMaskToVector()) + { + return nullptr; + } + + unsigned simdBaseTypeSize = genTypeSize(node->GetSimdBaseType()); + + if ((genTypeSize(cvtOp1->GetSimdBaseType()) != simdBaseTypeSize) || + (genTypeSize(cvtOp2->GetSimdBaseType()) != simdBaseTypeSize)) + { + // We need both operands to be the same kind of mask; otherwise + // the bitwise operation can differ in how it performs + return nullptr; + } + + NamedIntrinsic maskIntrinsicId = NI_Illegal; + + switch (oper) + { + case GT_AND: + { + maskIntrinsicId = NI_EVEX_AndMask; + break; + } + + case GT_NOT: + { + maskIntrinsicId = NI_EVEX_NotMask; + break; + } + + case GT_OR: + { + maskIntrinsicId = NI_EVEX_OrMask; + break; + } + + case GT_XOR: + { + maskIntrinsicId = NI_EVEX_XorMask; + break; + } + + default: + { + unreached(); + } + } + + if (maskIntrinsicId == NI_Illegal) + { + return nullptr; + } + + if (oper == actualOper) + { + node->ChangeHWIntrinsicId(maskIntrinsicId); + node->Op(1) = cvtOp1->Op(1); + } + else + { + assert(oper == GT_NOT); + node->ResetHWIntrinsicId(maskIntrinsicId, this, cvtOp1->Op(1)); + node->gtFlags &= ~GTF_REVERSE_OPS; + } + + node->gtType = TYP_MASK; + DEBUG_DESTROY_NODE(op1); + + if (oper != GT_NOT) + { + assert(actualOp2 != nullptr); + node->Op(2) = cvtOp2->Op(1); + } + + if (actualOp2 != nullptr) + { + DEBUG_DESTROY_NODE(actualOp2); + } + + node->SetMorphed(this); + node = gtNewSimdCvtMaskToVectorNode(retType, node, simdBaseJitType, simdSize)->AsHWIntrinsic(); + node->SetMorphed(this); + return node; + } +#elif defined(TARGET_ARM64) + return fgMorphTryUseAllMaskVariant(node); +#else +#error Unsupported platform +#endif + return nullptr; +} + +#ifdef TARGET_ARM64 +//------------------------------------------------------------------------ +// canMorphVectorOperandToMask: Can this vector operand be converted 
to a +// node with type TYP_MASK easily? +// +bool Compiler::canMorphVectorOperandToMask(GenTree* node) +{ + return varTypeIsMask(node) || node->OperIsConvertMaskToVector() || node->IsVectorZero(); +} + +//------------------------------------------------------------------------ +// canMorphAllVectorOperandsToMasks: Can all vector operands to this node +// be converted to a node with type +// TYP_MASK easily? +// +bool Compiler::canMorphAllVectorOperandsToMasks(GenTreeHWIntrinsic* node) +{ + bool allMaskConversions = true; + for (size_t i = 1; i <= node->GetOperandCount() && allMaskConversions; i++) + { + allMaskConversions &= canMorphVectorOperandToMask(node->Op(i)); + } + + return allMaskConversions; +} + +//------------------------------------------------------------------------ +// doMorphVectorOperandToMask: Morph a vector node that is close to a mask +// node into a mask node. +// +// Return Value: +// The morphed tree, or nullptr if the transform is not applicable. +// +GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent) +{ + if (varTypeIsMask(node)) + { + // Already a mask, nothing to do. + return node; + } + else if (node->OperIsConvertMaskToVector()) + { + // Replace node with op1. + return node->AsHWIntrinsic()->Op(1); + } + else if (node->IsVectorZero()) + { + // Morph the vector of zeroes into a mask of zeroes. + GenTree* mask = gtNewSimdFalseMaskByteNode(parent->GetSimdSize()); + mask->SetMorphed(this); + return mask; + } + + return nullptr; +} + +//----------------------------------------------------------------------------------------------------- +// fgMorphTryUseAllMaskVariant: For NamedIntrinsics that have a variant where all operands are +// mask nodes: if every operand of this node 'suggests' that it +// originates closely from a mask but is of a vector type, then morph the +// operands to use mask types instead. 'Suggesting' +// is defined by the canMorphVectorOperandToMask function. +// +// Arguments: +// node - The HWIntrinsic to try and optimize. +// +// Return Value: +// The fully morphed tree if a change was made, else nullptr. +// +GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node) +{ + if (HWIntrinsicInfo::HasAllMaskVariant(node->GetHWIntrinsicId())) + { + NamedIntrinsic maskVariant = HWIntrinsicInfo::GetMaskVariant(node->GetHWIntrinsicId()); + + // As some intrinsics have many variants, check that the count of operands on the node + // matches the number of operands required for the mask variant of the intrinsic. The mask + // variant of the intrinsic must have a fixed number of operands. + int numArgs = HWIntrinsicInfo::lookupNumArgs(maskVariant); + assert(numArgs >= 0); + if (node->GetOperandCount() == (size_t)numArgs) + { + // The mask variant has matching arity, so pattern match on the operands.
+ if (canMorphAllVectorOperandsToMasks(node)) + { + switch (node->GetOperandCount()) + { + case 1: + node->ResetHWIntrinsicId(maskVariant, doMorphVectorOperandToMask(node->Op(1), node)); + break; + case 2: + node->ResetHWIntrinsicId(maskVariant, doMorphVectorOperandToMask(node->Op(1), node), + doMorphVectorOperandToMask(node->Op(2), node)); + break; + case 3: + node->ResetHWIntrinsicId(maskVariant, this, doMorphVectorOperandToMask(node->Op(1), node), + doMorphVectorOperandToMask(node->Op(2), node), + doMorphVectorOperandToMask(node->Op(3), node)); + break; + default: + unreached(); + } + + node->gtType = TYP_MASK; + return node; + } + } + } + + return nullptr; +} +#endif // TARGET_ARM64 + +#endif // FEATURE_MASKED_HW_INTRINSICS + //------------------------------------------------------------------------ // fgOptimizeHWIntrinsicAssociative: Morph an associative GenTreeHWIntrinsic tree. //
diff --git a/src/tests/JIT/opt/SVE/Directory.Build.props b/src/tests/JIT/opt/SVE/Directory.Build.props new file mode 100644 index 00000000000000..0d03787db07ada --- /dev/null +++ b/src/tests/JIT/opt/SVE/Directory.Build.props @@ -0,0 +1,8 @@ + + + + + $(NoWarn);SYSLIB5003 + true + + \ No newline at end of file
diff --git a/src/tests/JIT/opt/SVE/PredicateInstructions.cs b/src/tests/JIT/opt/SVE/PredicateInstructions.cs new file mode 100644 index 00000000000000..6d460adbf4beb1 --- /dev/null +++ b/src/tests/JIT/opt/SVE/PredicateInstructions.cs @@ -0,0 +1,135 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Numerics; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using Xunit; + +public class PredicateInstructions +{ + [MethodImpl(MethodImplOptions.NoInlining)] + [Fact] + public static void TestPredicateInstructions() + { + ZipLow(); + ZipHigh(); + UnzipOdd(); + UnzipEven(); + TransposeOdd(); + TransposeEven(); + ReverseElement(); + And(); + BitwiseClear(); + Xor(); + Or(); + ConditionalSelect(); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<short> ZipLow() + { + //ARM64-FULL-LINE: zip1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.ZipLow(Vector<short>.Zero, Sve.CreateTrueMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<uint> ZipHigh() + { + //ARM64-FULL-LINE: zip2 {{p[0-9]+}}.s, {{p[0-9]+}}.s, {{p[0-9]+}}.s + return Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), Sve.CreateTrueMaskUInt32()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<sbyte> UnzipEven() + { + //ARM64-FULL-LINE: uzp1 {{p[0-9]+}}.b, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.UnzipEven(Sve.CreateTrueMaskSByte(), Vector<sbyte>.Zero); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<short> UnzipOdd() + { + //ARM64-FULL-LINE: uzp2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.UnzipOdd(Sve.CreateTrueMaskInt16(), Sve.CreateFalseMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<long> TransposeEven() + { + //ARM64-FULL-LINE: trn1 {{p[0-9]+}}.d, {{p[0-9]+}}.d, {{p[0-9]+}}.d + return Sve.TransposeEven(Sve.CreateFalseMaskInt64(), Sve.CreateTrueMaskInt64()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<short> TransposeOdd() + { + //ARM64-FULL-LINE: trn2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.TransposeOdd(Vector<short>.Zero, Sve.CreateTrueMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<short> ReverseElement() + { + //ARM64-FULL-LINE: rev {{p[0-9]+}}.h, {{p[0-9]+}}.h + return Sve.ReverseElement(Sve.CreateTrueMaskInt16()); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<short> And() + { + //ARM64-FULL-LINE: and {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.ConditionalSelect( + Sve.CreateTrueMaskInt16(), + Sve.And(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), + Vector<short>.Zero + ); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<short> BitwiseClear() + { + //ARM64-FULL-LINE: bic {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.ConditionalSelect( + Sve.CreateFalseMaskInt16(), + Sve.BitwiseClear(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), + Vector<short>.Zero + ); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<int> Xor() + { + //ARM64-FULL-LINE: eor {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.ConditionalSelect( + Sve.CreateTrueMaskInt32(), + Sve.Xor(Sve.CreateTrueMaskInt32(), Sve.CreateTrueMaskInt32()), + Vector<int>.Zero + ); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<short> Or() + { + //ARM64-FULL-LINE: orr {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.ConditionalSelect( + Sve.CreateTrueMaskInt16(), + Sve.Or(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()), + Vector<short>.Zero + ); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector<int> ConditionalSelect() + { + //ARM64-FULL-LINE: sel {{p[0-9]+}}.b, {{p[0-9]+}}, {{p[0-9]+}}.b, {{p[0-9]+}}.b + return Sve.ConditionalSelect( + Vector<int>.Zero, + Sve.CreateFalseMaskInt32(), + Sve.CreateTrueMaskInt32() + ); + } +} \ No newline at end of file
diff --git a/src/tests/JIT/opt/SVE/PredicateInstructions.csproj b/src/tests/JIT/opt/SVE/PredicateInstructions.csproj new file mode 100644 index 00000000000000..9005125d885bb0 --- /dev/null +++ b/src/tests/JIT/opt/SVE/PredicateInstructions.csproj @@ -0,0 +1,15 @@ + + + true + true + None + True + + + + true + + + + + \ No newline at end of file
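
Illustrative note (not part of the patch): the sketch below shows the kind of managed code this morph targets. Both SVE comparisons produce per-element masks, so the Sve.And that combines them previously round-tripped through ConvertMaskToVector nodes and vector registers; with HW_Flag_HasAllMaskVariant, fgMorphTryUseAllMaskVariant should rewrite that And to NI_Sve_And_Predicates so a single predicate-register AND can be emitted, matching the And() disassembly check in PredicateInstructions.cs above. The InRangeMask helper name and the expected codegen are assumptions for illustration only; it assumes SVE-capable hardware and the experimental Sve API surface used by the test.

// Illustrative sketch only; not part of this change.
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.Arm;

public static class PredicateMorphExample
{
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static Vector<int> InRangeMask(Vector<int> values, Vector<int> low, Vector<int> high)
    {
        if (!Sve.IsSupported)
        {
            return Vector<int>.Zero;
        }

        // Each comparison yields a per-element mask (TYP_MASK inside the JIT).
        Vector<int> aboveLow  = Sve.CompareGreaterThan(values, low);
        Vector<int> belowHigh = Sve.CompareLessThan(values, high);

        // Both operands of Sve.And originate from masks, so morph can switch to the
        // all-mask (predicate) variant instead of converting the masks back to vectors.
        return Sve.ConditionalSelect(
            Sve.CreateTrueMaskInt32(),
            Sve.And(aboveLow, belowHigh),
            Vector<int>.Zero);
    }
}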