diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index daf898ff37217f..fc07b06796466f 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -3132,6 +3132,7 @@ class Compiler
#if defined(TARGET_ARM64)
GenTree* gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize);
+ GenTree* gtNewSimdFalseMaskByteNode(unsigned simdSize);
#endif
GenTree* gtNewSimdBinOpNode(genTreeOps op,
@@ -6691,6 +6692,15 @@ class Compiler
GenTree* fgMorphHWIntrinsic(GenTreeHWIntrinsic* tree);
GenTree* fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node);
GenTree* fgOptimizeHWIntrinsicAssociative(GenTreeHWIntrinsic* node);
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
+ GenTreeHWIntrinsic* fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* node);
+#endif // FEATURE_MASKED_HW_INTRINSICS
+#ifdef TARGET_ARM64
+ bool canMorphVectorOperandToMask(GenTree* node);
+ bool canMorphAllVectorOperandsToMasks(GenTreeHWIntrinsic* node);
+ GenTree* doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent);
+ GenTreeHWIntrinsic* fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node);
+#endif // TARGET_ARM64
#endif // FEATURE_HW_INTRINSICS
GenTree* fgOptimizeCommutativeArithmetic(GenTreeOp* tree);
GenTree* fgOptimizeRelationalComparisonWithCasts(GenTreeOp* cmp);
diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h
index d936d579d8e25a..470ab85650bf7c 100644
--- a/src/coreclr/jit/hwintrinsic.h
+++ b/src/coreclr/jit/hwintrinsic.h
@@ -233,6 +233,11 @@ enum HWIntrinsicFlag : unsigned int
// The intrinsic is a reduce operation.
HW_Flag_ReduceOperation = 0x2000000,
+ // This intrinsic can be implemented with another intrinsic when all of its operands are of type TYP_MASK,
+ // and that other intrinsic produces a value of the same type. Used in morph to convert vector operations
+ // into mask operations when the intrinsic is operating on masks (mainly bitwise operations).
+ HW_Flag_HasAllMaskVariant = 0x4000000,
+
#else
#error Unsupported platform
#endif
@@ -1133,6 +1138,67 @@ struct HWIntrinsicInfo
}
}
+#ifdef FEATURE_MASKED_HW_INTRINSICS
+ // HasAllMaskVariant: Does the intrinsic have an intrinsic variant that operates on mask types?
+ //
+ // Arguments:
+ // id -- the intrinsic to check for a mask-type variant.
+ //
+ // Return Value:
+ // true when the intrinsic has a mask-type variant, else false
+ //
+ static bool HasAllMaskVariant(NamedIntrinsic id)
+ {
+ const HWIntrinsicFlag flags = lookupFlags(id);
+ return (flags & HW_Flag_HasAllMaskVariant) != 0;
+ }
+
+ // GetMaskVariant: Given an intrinsic that has a variant that operates on mask types, return the ID of
+ // this variant intrinsic. Call HasAllMaskVariant before using this function, as it will
+ // assert if no match is found.
+ //
+ // Arguments:
+ // id -- the intrinsic with a mask-type variant.
+ //
+ // Return Value:
+ // The ID of the mask-type variant for the given intrinsic
+ //
+ static NamedIntrinsic GetMaskVariant(NamedIntrinsic id)
+ {
+ assert(HasAllMaskVariant(id));
+ switch (id)
+ {
+ case NI_Sve_And:
+ return NI_Sve_And_Predicates;
+ case NI_Sve_BitwiseClear:
+ return NI_Sve_BitwiseClear_Predicates;
+ case NI_Sve_Xor:
+ return NI_Sve_Xor_Predicates;
+ case NI_Sve_Or:
+ return NI_Sve_Or_Predicates;
+ case NI_Sve_ZipHigh:
+ return NI_Sve_ZipHigh_Predicates;
+ case NI_Sve_ZipLow:
+ return NI_Sve_ZipLow_Predicates;
+ case NI_Sve_UnzipOdd:
+ return NI_Sve_UnzipOdd_Predicates;
+ case NI_Sve_UnzipEven:
+ return NI_Sve_UnzipEven_Predicates;
+ case NI_Sve_TransposeEven:
+ return NI_Sve_TransposeEven_Predicates;
+ case NI_Sve_TransposeOdd:
+ return NI_Sve_TransposeOdd_Predicates;
+ case NI_Sve_ReverseElement:
+ return NI_Sve_ReverseElement_Predicates;
+ case NI_Sve_ConditionalSelect:
+ return NI_Sve_ConditionalSelect_Predicates;
+
+ default:
+ unreached();
+ }
+ }
+#endif // FEATURE_MASKED_HW_INTRINSICS
+
#endif // TARGET_ARM64
static bool HasSpecialSideEffect(NamedIntrinsic id)
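For context, a minimal C# sketch of the kind of source pattern HW_Flag_HasAllMaskVariant targets (illustrative only, not part of the patch; it assumes an SVE-capable target, uses only existing System.Runtime.Intrinsics.Arm.Sve APIs, and the class/method names are invented for the example): when every operand of a bitwise operation is itself a mask, morph can switch the node to its _Predicates variant and keep the whole computation in predicate registers.

using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static class MaskVariantExample
{
    // Both comparisons produce masks internally. Because Sve.And is now marked
    // HW_Flag_HasAllMaskVariant, morph can rewrite the And to NI_Sve_And_Predicates,
    // so an "and p.b" can be emitted instead of converting the masks to vectors first.
    static Vector<int> InRange(Vector<int> value, Vector<int> lo, Vector<int> hi)
    {
        Vector<int> aboveLo = Sve.CompareGreaterThan(value, lo);
        Vector<int> belowHi = Sve.CompareLessThan(value, hi);
        Vector<int> both    = Sve.And(aboveLo, belowHi);
        return Sve.ConditionalSelect(both, value, Vector<int>.Zero);
    }
}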
diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index af6c1bb935ae81..06b6b1edf77aa6 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -3341,7 +3341,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
}
//------------------------------------------------------------------------
-// gtNewSimdEmbeddedMaskNode: Create an embedded mask
+// gtNewSimdAllTrueMaskNode: Create an embedded mask with all bits set to true
//
// Arguments:
// simdBaseJitType -- the base jit type of the nodes being masked
@@ -3355,4 +3355,18 @@ GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigne
return gtNewSimdHWIntrinsicNode(TYP_MASK, NI_Sve_CreateTrueMaskAll, simdBaseJitType, simdSize);
}
+//------------------------------------------------------------------------
+// gtNewSimdFalseMaskByteNode: Create an embedded mask with all bits set to false
+//
+// Arguments:
+// simdSize -- the simd size of the nodes being masked
+//
+// Return Value:
+// The mask
+//
+GenTree* Compiler::gtNewSimdFalseMaskByteNode(unsigned simdSize)
+{
+ return gtNewSimdHWIntrinsicNode(TYP_MASK, NI_Sve_CreateFalseMaskByte, CORINFO_TYPE_UBYTE, simdSize);
+}
+
#endif // FEATURE_HW_INTRINSICS
diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
index b03b492fa4045e..bdf319aeb5b460 100644
--- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -702,6 +702,14 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}
+ case NI_Sve_And_Predicates:
+ case NI_Sve_BitwiseClear_Predicates:
+ case NI_Sve_Or_Predicates:
+ case NI_Sve_Xor_Predicates:
+ GetEmitter()->emitIns_R_R_R_R(insEmbMask, emitSize, targetReg, maskReg, embMaskOp1Reg,
+ embMaskOp2Reg, INS_OPTS_SCALABLE_B);
+ break;
+
default:
{
GetEmitter()->emitIns_R_R_R_R(insEmbMask, emitSize, targetReg, maskReg, embMaskOp1Reg,
@@ -2478,6 +2486,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
case NI_Sve_CreateBreakAfterPropagateMask:
case NI_Sve_CreateBreakBeforePropagateMask:
+ case NI_Sve_ConditionalSelect_Predicates:
{
GetEmitter()->emitInsSve_R_R_R_R(ins, emitSize, targetReg, op1Reg, op2Reg, op3Reg, INS_OPTS_SCALABLE_B);
break;
diff --git a/src/coreclr/jit/hwintrinsiclistarm64sve.h b/src/coreclr/jit/hwintrinsiclistarm64sve.h
index 54f797376eca04..7038e725808909 100644
--- a/src/coreclr/jit/hwintrinsiclistarm64sve.h
+++ b/src/coreclr/jit/hwintrinsiclistarm64sve.h
@@ -27,9 +27,9 @@ HARDWARE_INTRINSIC(Sve, AddAcross,
HARDWARE_INTRINSIC(Sve, AddRotateComplex, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcadd, INS_sve_fcadd}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_HasImmediateOperand)
HARDWARE_INTRINSIC(Sve, AddSaturate, -1, 2, {INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable)
HARDWARE_INTRINSIC(Sve, AddSequentialAcross, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fadda, INS_sve_fadda}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_ReduceOperation)
-HARDWARE_INTRINSIC(Sve, And, -1, -1, {INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
+HARDWARE_INTRINSIC(Sve, And, -1, -1, {INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant)
HARDWARE_INTRINSIC(Sve, AndAcross, -1, -1, {INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation)
-HARDWARE_INTRINSIC(Sve, BitwiseClear, -1, -1, {INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
+HARDWARE_INTRINSIC(Sve, BitwiseClear, -1, -1, {INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant)
HARDWARE_INTRINSIC(Sve, BooleanNot, -1, -1, {INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, Compact, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_compact, INS_sve_compact, INS_sve_compact, INS_sve_compact, INS_sve_compact, INS_sve_compact}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, CompareEqual, -1, -1, {INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_fcmeq, INS_sve_fcmeq}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_ZeroingMaskedOperation)
@@ -47,7 +47,7 @@ HARDWARE_INTRINSIC(Sve, ConditionalExtractAfterLastActiveElement,
HARDWARE_INTRINSIC(Sve, ConditionalExtractAfterLastActiveElementAndReplicate, -1, 3, {INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta, INS_sve_clasta}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics)
HARDWARE_INTRINSIC(Sve, ConditionalExtractLastActiveElement, -1, 3, {INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_HasScalarInputVariant|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, ConditionalExtractLastActiveElementAndReplicate, -1, 3, {INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics)
-HARDWARE_INTRINSIC(Sve, ConditionalSelect, -1, 3, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_SupportsContainment)
+HARDWARE_INTRINSIC(Sve, ConditionalSelect, -1, 3, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_SupportsContainment|HW_Flag_HasAllMaskVariant)
HARDWARE_INTRINSIC(Sve, ConvertToDouble, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_scvtf, INS_sve_ucvtf, INS_sve_scvtf, INS_sve_ucvtf, INS_sve_fcvt, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, ConvertToInt32, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcvtzs, INS_sve_fcvtzs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, ConvertToInt64, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcvtzs, INS_sve_fcvtzs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
@@ -224,7 +224,7 @@ HARDWARE_INTRINSIC(Sve, MultiplyExtended,
HARDWARE_INTRINSIC(Sve, MultiplySubtract, -1, -1, {INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_FmaIntrinsic|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, Negate, -1, -1, {INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_fneg, INS_sve_fneg}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, Not, -1, -1, {INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation)
-HARDWARE_INTRINSIC(Sve, Or, -1, -1, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
+HARDWARE_INTRINSIC(Sve, Or, -1, -1, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant)
HARDWARE_INTRINSIC(Sve, OrAcross, -1, -1, {INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation)
HARDWARE_INTRINSIC(Sve, PopCount, -1, -1, {INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, PrefetchBytes, -1, 3, {INS_invalid, INS_sve_prfb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_SpecialSideEffect_Other)
@@ -237,7 +237,7 @@ HARDWARE_INTRINSIC(Sve, ReciprocalSqrtEstimate,
HARDWARE_INTRINSIC(Sve, ReciprocalSqrtStep, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_frsqrts, INS_sve_frsqrts}, HW_Category_SIMD, HW_Flag_Scalable)
HARDWARE_INTRINSIC(Sve, ReciprocalStep, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_frecps, INS_sve_frecps}, HW_Category_SIMD, HW_Flag_Scalable)
HARDWARE_INTRINSIC(Sve, ReverseBits, -1, -1, {INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_sve_rbit, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
-HARDWARE_INTRINSIC(Sve, ReverseElement, -1, 1, {INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Sve, ReverseElement, -1, 1, {INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant)
HARDWARE_INTRINSIC(Sve, ReverseElement16, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_revh, INS_sve_revh, INS_sve_revh, INS_sve_revh, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, ReverseElement32, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_revw, INS_sve_revw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, ReverseElement8, -1, -1, {INS_invalid, INS_invalid, INS_sve_revb, INS_sve_revb, INS_sve_revb, INS_sve_revb, INS_sve_revb, INS_sve_revb, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
@@ -284,23 +284,23 @@ HARDWARE_INTRINSIC(Sve, SubtractSaturate,
HARDWARE_INTRINSIC(Sve, TestAnyTrue, -1, 2, {INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, TestFirstTrue, -1, 2, {INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, TestLastTrue, -1, 2, {INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_sve_ptest, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen)
-HARDWARE_INTRINSIC(Sve, TransposeEven, -1, 2, {INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen)
-HARDWARE_INTRINSIC(Sve, TransposeOdd, -1, 2, {INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Sve, TransposeEven, -1, 2, {INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant)
+HARDWARE_INTRINSIC(Sve, TransposeOdd, -1, 2, {INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant)
HARDWARE_INTRINSIC(Sve, TrigonometricMultiplyAddCoefficient, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ftmad, INS_sve_ftmad}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, TrigonometricSelectCoefficient, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ftssel, INS_sve_ftssel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Sve, TrigonometricStartingValue, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ftsmul, INS_sve_ftsmul}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(Sve, UnzipEven, -1, 2, {INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen)
-HARDWARE_INTRINSIC(Sve, UnzipOdd, -1, 2, {INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Sve, UnzipEven, -1, 2, {INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant)
+HARDWARE_INTRINSIC(Sve, UnzipOdd, -1, 2, {INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant)
HARDWARE_INTRINSIC(Sve, VectorTableLookup, -1, 2, {INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl}, HW_Category_SIMD, HW_Flag_Scalable)
-HARDWARE_INTRINSIC(Sve, Xor, -1, -1, {INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
+HARDWARE_INTRINSIC(Sve, Xor, -1, -1, {INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant)
HARDWARE_INTRINSIC(Sve, XorAcross, -1, -1, {INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation)
HARDWARE_INTRINSIC(Sve, ZeroExtend16, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_uxth, INS_invalid, INS_sve_uxth, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, ZeroExtend32, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_uxtw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, ZeroExtend8, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_sve_uxtb, INS_invalid, INS_sve_uxtb, INS_invalid, INS_sve_uxtb, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, ZeroExtendWideningLower, -1, 1, {INS_invalid, INS_sve_uunpklo, INS_invalid, INS_sve_uunpklo, INS_invalid, INS_sve_uunpklo, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Sve, ZeroExtendWideningUpper, -1, 1, {INS_invalid, INS_sve_uunpkhi, INS_invalid, INS_sve_uunpkhi, INS_invalid, INS_sve_uunpkhi, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(Sve, ZipHigh, -1, 2, {INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen)
-HARDWARE_INTRINSIC(Sve, ZipLow, -1, 2, {INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Sve, ZipHigh, -1, 2, {INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant)
+HARDWARE_INTRINSIC(Sve, ZipLow, -1, 2, {INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant)
#define LAST_NI_Sve NI_Sve_ZipLow
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
@@ -333,6 +333,19 @@ HARDWARE_INTRINSIC(Sve, SaturatingIncrementBy64BitElementCountScalar,
HARDWARE_INTRINSIC(Sve, StoreAndZipx2, -1, 3, {INS_sve_st2b, INS_sve_st2b, INS_sve_st2h, INS_sve_st2h, INS_sve_st2w, INS_sve_st2w, INS_sve_st2d, INS_sve_st2d, INS_sve_st2w, INS_sve_st2d}, HW_Category_MemoryStore, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters)
HARDWARE_INTRINSIC(Sve, StoreAndZipx3, -1, 3, {INS_sve_st3b, INS_sve_st3b, INS_sve_st3h, INS_sve_st3h, INS_sve_st3w, INS_sve_st3w, INS_sve_st3d, INS_sve_st3d, INS_sve_st3w, INS_sve_st3d}, HW_Category_MemoryStore, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters)
HARDWARE_INTRINSIC(Sve, StoreAndZipx4, -1, 3, {INS_sve_st4b, INS_sve_st4b, INS_sve_st4h, INS_sve_st4h, INS_sve_st4w, INS_sve_st4w, INS_sve_st4d, INS_sve_st4d, INS_sve_st4w, INS_sve_st4d}, HW_Category_MemoryStore, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters)
+// Predicate variants of intrinsics. These are specialized for operating on TYP_MASK values.
+HARDWARE_INTRINSIC(Sve, And_Predicates, -1, 2, {INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Sve, BitwiseClear_Predicates, -1, 2, {INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Sve, Or_Predicates, -1, 2, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Sve, Xor_Predicates, -1, 2, {INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Sve, ConditionalSelect_Predicates, -1, 3, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_ExplicitMaskedOperation|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(Sve, ZipHigh_Predicates, -1, 2, {INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(Sve, ZipLow_Predicates, -1, 2, {INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(Sve, UnzipEven_Predicates, -1, 2, {INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(Sve, UnzipOdd_Predicates, -1, 2, {INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(Sve, TransposeEven_Predicates, -1, 2, {INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1, INS_sve_trn1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(Sve, TransposeOdd_Predicates, -1, 2, {INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2, INS_sve_trn2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(Sve, ReverseElement_Predicates, -1, 1, {INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask)
#endif // FEATURE_HW_INTRINSIC
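The new _Predicates entries are JIT-internal forms: they reuse the same underlying instructions but take TYP_MASK operands and return a per-element mask. As a rough user-level sketch of the three-operand case (not part of the patch; only existing Sve APIs are used and the names below are invented for the example), a ConditionalSelect whose operands are all masks can be switched to ConditionalSelect_Predicates and emitted as a predicate "sel".

using System.Numerics;
using System.Runtime.Intrinsics.Arm;

static class SelectOnMasksExample
{
    // All three operands are comparison results, i.e. masks internally, so morph
    // can pick NI_Sve_ConditionalSelect_Predicates and emit "sel p.b, p, p.b, p.b".
    static Vector<int> PickMask(Vector<int> a, Vector<int> b)
    {
        Vector<int> gt = Sve.CompareGreaterThan(a, b);
        Vector<int> lt = Sve.CompareLessThan(a, b);
        Vector<int> eq = Sve.CompareEqual(a, b);
        return Sve.ConditionalSelect(gt, lt, eq);
    }
}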
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index b52c8e48cbe0e4..37861d2284a2e5 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -9397,151 +9397,11 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
default:
{
#if defined(FEATURE_MASKED_HW_INTRINSICS)
- bool isScalar = false;
- genTreeOps actualOper = node->GetOperForHWIntrinsicId(&isScalar);
- genTreeOps oper = actualOper;
-
- // We shouldn't find AND_NOT, OR_NOT or XOR_NOT nodes since it should only be produced in lowering
- assert((oper != GT_AND_NOT) && (oper != GT_OR_NOT) && (oper != GT_XOR_NOT));
-
- if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper))
+ GenTreeHWIntrinsic* maskedIntrinsic = fgOptimizeForMaskedIntrinsic(node);
+ if (maskedIntrinsic != nullptr)
{
- GenTree* op1 = node->Op(1);
-
- GenTree* op2;
- GenTree* actualOp2;
-
- if (oper == GT_NOT)
- {
- op2 = op1;
- actualOp2 = nullptr;
- }
- else
- {
- op2 = node->Op(2);
- actualOp2 = op2;
- }
-
- // We need both operands to be ConvertMaskToVector in
- // order to optimize this to a direct mask operation
-
- if (!op1->OperIsConvertMaskToVector())
- {
- break;
- }
-
- if (!op2->OperIsHWIntrinsic())
- {
-#if defined(TARGET_XARCH)
- if ((oper != GT_XOR) || !op2->IsVectorAllBitsSet())
- {
- break;
- }
-
- // We want to explicitly recognize op1 ^ AllBitsSet as
- // some platforms don't have direct support for ~op1
-
- oper = GT_NOT;
- op2 = op1;
-#else
- break;
-#endif
- }
-
- GenTreeHWIntrinsic* cvtOp1 = op1->AsHWIntrinsic();
- GenTreeHWIntrinsic* cvtOp2 = op2->AsHWIntrinsic();
-
- if (!cvtOp2->OperIsConvertMaskToVector())
- {
- break;
- }
-
- unsigned simdBaseTypeSize = genTypeSize(simdBaseType);
-
- if ((genTypeSize(cvtOp1->GetSimdBaseType()) != simdBaseTypeSize) ||
- (genTypeSize(cvtOp2->GetSimdBaseType()) != simdBaseTypeSize))
- {
- // We need both operands to be the same kind of mask; otherwise
- // the bitwise operation can differ in how it performs
- break;
- }
-
- NamedIntrinsic maskIntrinsicId = NI_Illegal;
-
-#if defined(TARGET_XARCH)
- switch (oper)
- {
- case GT_AND:
- {
- maskIntrinsicId = NI_EVEX_AndMask;
- break;
- }
-
- case GT_NOT:
- {
- maskIntrinsicId = NI_EVEX_NotMask;
- break;
- }
-
- case GT_OR:
- {
- maskIntrinsicId = NI_EVEX_OrMask;
- break;
- }
-
- case GT_XOR:
- {
- maskIntrinsicId = NI_EVEX_XorMask;
- break;
- }
-
- default:
- {
- unreached();
- }
- }
-#elif defined(TARGET_ARM64)
- // TODO-ARM64-CQ: Support transforming bitwise operations on masks
- break;
-#else
-#error Unsupported platform
-#endif // !TARGET_XARCH && !TARGET_ARM64
-
- if (maskIntrinsicId == NI_Illegal)
- {
- break;
- }
-
- if (oper == actualOper)
- {
- node->ChangeHWIntrinsicId(maskIntrinsicId);
- node->Op(1) = cvtOp1->Op(1);
- }
- else
- {
- assert(oper == GT_NOT);
- node->ResetHWIntrinsicId(maskIntrinsicId, this, cvtOp1->Op(1));
- node->gtFlags &= ~GTF_REVERSE_OPS;
- }
-
- node->gtType = TYP_MASK;
- DEBUG_DESTROY_NODE(op1);
-
- if (oper != GT_NOT)
- {
- assert(actualOp2 != nullptr);
- node->Op(2) = cvtOp2->Op(1);
- }
-
- if (actualOp2 != nullptr)
- {
- DEBUG_DESTROY_NODE(actualOp2);
- }
-
+ node = maskedIntrinsic;
node->SetMorphed(this);
- node = gtNewSimdCvtMaskToVectorNode(retType, node, simdBaseJitType, simdSize)->AsHWIntrinsic();
- node->SetMorphed(this);
- return node;
}
#endif // FEATURE_MASKED_HW_INTRINSICS
break;
@@ -9726,6 +9586,284 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
return node;
}
+#if defined(FEATURE_MASKED_HW_INTRINSICS)
+//------------------------------------------------------------------------
+// fgOptimizeForMaskedIntrinsic: Tries to recognize an intrinsic that is operating
+// on mask types and morphs the tree to use an
+// intrinsic better suited to masks.
+//
+// Arguments:
+// node - the hardware intrinsic tree to try and optimize;
+// it is mutated in place when the optimization applies
+//
+// Return Value:
+// The optimized tree, or nullptr if no change was made.
+//
+GenTreeHWIntrinsic* Compiler::fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* node)
+{
+#if defined(TARGET_XARCH)
+ bool isScalar = false;
+ genTreeOps actualOper = node->GetOperForHWIntrinsicId(&isScalar);
+ genTreeOps oper = actualOper;
+ var_types retType = node->TypeGet();
+ CorInfoType simdBaseJitType = node->GetSimdBaseJitType();
+ var_types simdBaseType = node->GetSimdBaseType();
+ unsigned simdSize = node->GetSimdSize();
+
+ // We shouldn't find AND_NOT, OR_NOT or XOR_NOT nodes since it should only be produced in lowering
+ assert((oper != GT_AND_NOT) && (oper != GT_OR_NOT) && (oper != GT_XOR_NOT));
+
+ if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper))
+ {
+ GenTree* op1 = node->Op(1);
+
+ GenTree* op2;
+ GenTree* actualOp2;
+
+ if (oper == GT_NOT)
+ {
+ op2 = op1;
+ actualOp2 = nullptr;
+ }
+ else
+ {
+ op2 = node->Op(2);
+ actualOp2 = op2;
+ }
+
+ // We need both operands to be ConvertMaskToVector in
+ // order to optimize this to a direct mask operation
+
+ if (!op1->OperIsConvertMaskToVector())
+ {
+ return nullptr;
+ }
+
+ if (!op2->OperIsHWIntrinsic())
+ {
+ if ((oper != GT_XOR) || !op2->IsVectorAllBitsSet())
+ {
+ return nullptr;
+ }
+
+ // We want to explicitly recognize op1 ^ AllBitsSet as
+ // some platforms don't have direct support for ~op1
+
+ oper = GT_NOT;
+ op2 = op1;
+ }
+
+ GenTreeHWIntrinsic* cvtOp1 = op1->AsHWIntrinsic();
+ GenTreeHWIntrinsic* cvtOp2 = op2->AsHWIntrinsic();
+
+ if (!cvtOp2->OperIsConvertMaskToVector())
+ {
+ return nullptr;
+ }
+
+ unsigned simdBaseTypeSize = genTypeSize(simdBaseType);
+
+ if ((genTypeSize(cvtOp1->GetSimdBaseType()) != simdBaseTypeSize) ||
+ (genTypeSize(cvtOp2->GetSimdBaseType()) != simdBaseTypeSize))
+ {
+ // We need both operands to be the same kind of mask; otherwise
+ // the bitwise operation can differ in how it performs
+ return nullptr;
+ }
+
+ NamedIntrinsic maskIntrinsicId = NI_Illegal;
+
+ switch (oper)
+ {
+ case GT_AND:
+ {
+ maskIntrinsicId = NI_EVEX_AndMask;
+ break;
+ }
+
+ case GT_NOT:
+ {
+ maskIntrinsicId = NI_EVEX_NotMask;
+ break;
+ }
+
+ case GT_OR:
+ {
+ maskIntrinsicId = NI_EVEX_OrMask;
+ break;
+ }
+
+ case GT_XOR:
+ {
+ maskIntrinsicId = NI_EVEX_XorMask;
+ break;
+ }
+
+ default:
+ {
+ unreached();
+ }
+ }
+
+ if (maskIntrinsicId == NI_Illegal)
+ {
+ return nullptr;
+ }
+
+ if (oper == actualOper)
+ {
+ node->ChangeHWIntrinsicId(maskIntrinsicId);
+ node->Op(1) = cvtOp1->Op(1);
+ }
+ else
+ {
+ assert(oper == GT_NOT);
+ node->ResetHWIntrinsicId(maskIntrinsicId, this, cvtOp1->Op(1));
+ node->gtFlags &= ~GTF_REVERSE_OPS;
+ }
+
+ node->gtType = TYP_MASK;
+ DEBUG_DESTROY_NODE(op1);
+
+ if (oper != GT_NOT)
+ {
+ assert(actualOp2 != nullptr);
+ node->Op(2) = cvtOp2->Op(1);
+ }
+
+ if (actualOp2 != nullptr)
+ {
+ DEBUG_DESTROY_NODE(actualOp2);
+ }
+
+ node->SetMorphed(this);
+ node = gtNewSimdCvtMaskToVectorNode(retType, node, simdBaseJitType, simdSize)->AsHWIntrinsic();
+ node->SetMorphed(this);
+ return node;
+ }
+#elif defined(TARGET_ARM64)
+ return fgMorphTryUseAllMaskVariant(node);
+#else
+#error Unsupported platform
+#endif
+ return nullptr;
+}
+
+#ifdef TARGET_ARM64
+//------------------------------------------------------------------------
+// canMorphVectorOperandToMask: Can this vector operand be trivially converted to a
+// node with type TYP_MASK?
+//
+// Return Value:
+// true if the operand is already a mask, a convert-mask-to-vector node,
+// or a zero vector; otherwise false.
+//
+bool Compiler::canMorphVectorOperandToMask(GenTree* node)
+{
+ return varTypeIsMask(node) || node->OperIsConvertMaskToVector() || node->IsVectorZero();
+}
+
+//------------------------------------------------------------------------
+// canMorphAllVectorOperandsToMasks: Can all vector operands to this node
+// be converted to a node with type
+// TYP_MASK easily?
+//
+bool Compiler::canMorphAllVectorOperandsToMasks(GenTreeHWIntrinsic* node)
+{
+ bool allMaskConversions = true;
+ for (size_t i = 1; i <= node->GetOperandCount() && allMaskConversions; i++)
+ {
+ allMaskConversions &= canMorphVectorOperandToMask(node->Op(i));
+ }
+
+ return allMaskConversions;
+}
+
+//------------------------------------------------------------------------
+// doMorphVectorOperandToMask: Morph a vector operand that is trivially convertible
+// to a mask into a mask node.
+//
+// Arguments:
+// node - the vector operand to morph
+// parent - the intrinsic node the operand belongs to (provides the simd size)
+//
+// Return value:
+// The morphed tree, or nullptr if the transform is not applicable.
+//
+GenTree* Compiler::doMorphVectorOperandToMask(GenTree* node, GenTreeHWIntrinsic* parent)
+{
+ if (varTypeIsMask(node))
+ {
+ // Already a mask, nothing to do.
+ return node;
+ }
+ else if (node->OperIsConvertMaskToVector())
+ {
+ // Replace node with op1.
+ return node->AsHWIntrinsic()->Op(1);
+ }
+ else if (node->IsVectorZero())
+ {
+ // Morph the vector of zeroes into mask of zeroes.
+ GenTree* mask = gtNewSimdFalseMaskByteNode(parent->GetSimdSize());
+ mask->SetMorphed(this);
+ return mask;
+ }
+
+ return nullptr;
+}
+
+//-----------------------------------------------------------------------------------------------------
+// fgMorphTryUseAllMaskVariant: For a NamedIntrinsic that has a variant where all operands are
+// mask nodes, try to switch to that variant. If all operands to this node
+// 'suggest' that they originate closely from a mask, but are of vector
+// types, then morph the operands as appropriate to use mask types instead.
+// 'Suggesting' is defined by the canMorphVectorOperandToMask function.
+//
+// Arguments:
+// node - The HWIntrinsic to try and optimize.
+//
+// Return Value:
+// The fully morphed tree if a change was made, else nullptr.
+//
+GenTreeHWIntrinsic* Compiler::fgMorphTryUseAllMaskVariant(GenTreeHWIntrinsic* node)
+{
+ if (HWIntrinsicInfo::HasAllMaskVariant(node->GetHWIntrinsicId()))
+ {
+ NamedIntrinsic maskVariant = HWIntrinsicInfo::GetMaskVariant(node->GetHWIntrinsicId());
+
+ // As some intrinsics have many variants, check that the count of operands on the node
+ // matches the number of operands required for the mask variant of the intrinsic. The mask
+ // variant of the intrinsic must have a fixed number of operands.
+ int numArgs = HWIntrinsicInfo::lookupNumArgs(maskVariant);
+ assert(numArgs >= 0);
+ if (node->GetOperandCount() == (size_t)numArgs)
+ {
+ // The arity matches the mask variant; now check that every operand can be morphed to a mask.
+ if (canMorphAllVectorOperandsToMasks(node))
+ {
+ switch (node->GetOperandCount())
+ {
+ case 1:
+ node->ResetHWIntrinsicId(maskVariant, doMorphVectorOperandToMask(node->Op(1), node));
+ break;
+ case 2:
+ node->ResetHWIntrinsicId(maskVariant, doMorphVectorOperandToMask(node->Op(1), node),
+ doMorphVectorOperandToMask(node->Op(2), node));
+ break;
+ case 3:
+ node->ResetHWIntrinsicId(maskVariant, this, doMorphVectorOperandToMask(node->Op(1), node),
+ doMorphVectorOperandToMask(node->Op(2), node),
+ doMorphVectorOperandToMask(node->Op(3), node));
+ break;
+ default:
+ unreached();
+ }
+
+ node->gtType = TYP_MASK;
+ return node;
+ }
+ }
+ }
+
+ return nullptr;
+}
+#endif // TARGET_ARM64
+
+#endif // FEATURE_MASKED_HW_INTRINSICS
+
//------------------------------------------------------------------------
// fgOptimizeHWIntrinsicAssociative: Morph an associative GenTreeHWIntrinsic tree.
//
diff --git a/src/tests/JIT/opt/SVE/Directory.Build.props b/src/tests/JIT/opt/SVE/Directory.Build.props
new file mode 100644
index 00000000000000..0d03787db07ada
--- /dev/null
+++ b/src/tests/JIT/opt/SVE/Directory.Build.props
@@ -0,0 +1,8 @@
+<Project>
+  <Import Project="$([MSBuild]::GetPathOfFileAbove(Directory.Build.props, $(MSBuildThisFileDirectory)..))" />
+
+  <PropertyGroup>
+    <NoWarn>$(NoWarn);SYSLIB5003</NoWarn>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+</Project>
\ No newline at end of file
diff --git a/src/tests/JIT/opt/SVE/PredicateInstructions.cs b/src/tests/JIT/opt/SVE/PredicateInstructions.cs
new file mode 100644
index 00000000000000..6d460adbf4beb1
--- /dev/null
+++ b/src/tests/JIT/opt/SVE/PredicateInstructions.cs
@@ -0,0 +1,135 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System;
+using System.Numerics;
+using System.Collections.Generic;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.Arm;
+using Xunit;
+
+public class PredicateInstructions
+{
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ [Fact]
+ public static void TestPredicateInstructions()
+ {
+ ZipLow();
+ ZipHigh();
+ UnzipOdd();
+ UnzipEven();
+ TransposeOdd();
+ TransposeEven();
+ ReverseElement();
+ And();
+ BitwiseClear();
+ Xor();
+ Or();
+ ConditionalSelect();
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<short> ZipLow()
+ {
+ //ARM64-FULL-LINE: zip1 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
+ return Sve.ZipLow(Vector<short>.Zero, Sve.CreateTrueMaskInt16());
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<uint> ZipHigh()
+ {
+ //ARM64-FULL-LINE: zip2 {{p[0-9]+}}.s, {{p[0-9]+}}.s, {{p[0-9]+}}.s
+ return Sve.ZipHigh(Sve.CreateTrueMaskUInt32(), Sve.CreateTrueMaskUInt32());
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<sbyte> UnzipEven()
+ {
+ //ARM64-FULL-LINE: uzp1 {{p[0-9]+}}.b, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+ return Sve.UnzipEven(Sve.CreateTrueMaskSByte(), Vector<sbyte>.Zero);
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<short> UnzipOdd()
+ {
+ //ARM64-FULL-LINE: uzp2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
+ return Sve.UnzipOdd(Sve.CreateTrueMaskInt16(), Sve.CreateFalseMaskInt16());
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<long> TransposeEven()
+ {
+ //ARM64-FULL-LINE: trn1 {{p[0-9]+}}.d, {{p[0-9]+}}.d, {{p[0-9]+}}.d
+ return Sve.TransposeEven(Sve.CreateFalseMaskInt64(), Sve.CreateTrueMaskInt64());
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<short> TransposeOdd()
+ {
+ //ARM64-FULL-LINE: trn2 {{p[0-9]+}}.h, {{p[0-9]+}}.h, {{p[0-9]+}}.h
+ return Sve.TransposeOdd(Vector<short>.Zero, Sve.CreateTrueMaskInt16());
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<short> ReverseElement()
+ {
+ //ARM64-FULL-LINE: rev {{p[0-9]+}}.h, {{p[0-9]+}}.h
+ return Sve.ReverseElement(Sve.CreateTrueMaskInt16());
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<short> And()
+ {
+ //ARM64-FULL-LINE: and {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+ return Sve.ConditionalSelect(
+ Sve.CreateTrueMaskInt16(),
+ Sve.And(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
+ Vector<short>.Zero
+ );
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<short> BitwiseClear()
+ {
+ //ARM64-FULL-LINE: bic {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+ return Sve.ConditionalSelect(
+ Sve.CreateFalseMaskInt16(),
+ Sve.BitwiseClear(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
+ Vector<short>.Zero
+ );
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<int> Xor()
+ {
+ //ARM64-FULL-LINE: eor {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+ return Sve.ConditionalSelect(
+ Sve.CreateTrueMaskInt32(),
+ Sve.Xor(Sve.CreateTrueMaskInt32(), Sve.CreateTrueMaskInt32()),
+ Vector<int>.Zero
+ );
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<short> Or()
+ {
+ //ARM64-FULL-LINE: orr {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+ return Sve.ConditionalSelect(
+ Sve.CreateTrueMaskInt16(),
+ Sve.Or(Sve.CreateTrueMaskInt16(), Sve.CreateTrueMaskInt16()),
+ Vector<short>.Zero
+ );
+ }
+
+ [MethodImpl(MethodImplOptions.NoInlining)]
+ static Vector<int> ConditionalSelect()
+ {
+ //ARM64-FULL-LINE: sel {{p[0-9]+}}.b, {{p[0-9]+}}, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+ return Sve.ConditionalSelect(
+ Vector<int>.Zero
+ Sve.CreateFalseMaskInt32(),
+ Sve.CreateTrueMaskInt32()
+ );
+ }
+}
\ No newline at end of file
diff --git a/src/tests/JIT/opt/SVE/PredicateInstructions.csproj b/src/tests/JIT/opt/SVE/PredicateInstructions.csproj
new file mode 100644
index 00000000000000..9005125d885bb0
--- /dev/null
+++ b/src/tests/JIT/opt/SVE/PredicateInstructions.csproj
@@ -0,0 +1,15 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+    <HasDisasmCheck>true</HasDisasmCheck>
+    <DebugType>None</DebugType>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+
+  <PropertyGroup>
+    <EnablePreviewFeatures>true</EnablePreviewFeatures>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+</Project>
\ No newline at end of file