@@ -7390,6 +7390,155 @@ size_t convertSequences_noRepcodes(SeqDef* dstSeqs, const ZSTD_Sequence* inSeqs,
  * but since this implementation is targeting modern systems (>= Sapphire Rapids),
  * it's not useful to develop and maintain code for older pre-AVX2 platforms */
 
+#elif defined(ZSTD_ARCH_ARM_SVE2)
+
+/*
+ * Checks if any active element in a signed 8-bit integer vector is greater
+ * than zero.
+ *
+ * @param g Governing predicate selecting active lanes.
+ * @param a Input vector of signed 8-bit integers.
+ *
+ * @return True if any active element in `a` is > 0, false otherwise.
+ */
+FORCE_INLINE_TEMPLATE int cmpgtz_any_s8(svbool_t g, svint8_t a)
+{
+    svbool_t ptest = svcmpgt_n_s8(g, a, 0);
+    return svptest_any(ptest, ptest);
+}
+
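+/*
+ * Converts `nbSequences` ZSTD_Sequence entries into SeqDef entries, without
+ * repcode handling. Returns 0 when every litLength and matchLength fits into
+ * 16 bits; otherwise returns `i + 1` when sequence `i` has a long matchLength,
+ * or `i + nbSequences + 1` when it has a long litLength (see the scalar tail
+ * loop below for the exact thresholds).
+ */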
+size_t convertSequences_noRepcodes(
+        SeqDef* dstSeqs,
+        const ZSTD_Sequence* inSeqs,
+        size_t nbSequences)
+{
+    /* Process `8 * VL / sizeof(ZSTD_Sequence)` sequences per iteration. */
+    const size_t lanes = 8 * svcntb() / sizeof(ZSTD_Sequence);
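+    /* e.g. with 128-bit vectors svcntb() == 16, so 8 * 16 / 16 = 8 sequences
+     * are converted per iteration; 16 sequences with 256-bit vectors, etc. */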
+    size_t longLen = 0;
+    size_t n = 0;
+
+    /* The SVE permutation depends on the exact layout of the source and
+     * destination structures. */
+    ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4);
+    ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8);
+    ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4);
+    ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6);
+
+    if (nbSequences >= lanes) {
+        const svbool_t ptrue = svptrue_b8();
+        /* 16-bit values {ZSTD_REP_NUM, 0, -MINMATCH, 0} extended to 32-bit lanes. */
+        const svuint32_t vaddition = svreinterpret_u32(
+            svunpklo_s32(svreinterpret_s16(svdup_n_u64(ZSTD_REP_NUM | (((U64)(U16)-MINMATCH) << 32)))));
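+        /* After the add below, each 16-byte input structure becomes
+         * { offset + ZSTD_REP_NUM, litLength, matchLength - MINMATCH, rep },
+         * i.e. the offBase/litLength/mlBase values of the scalar tail loop. */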
+        /* For permutation of 16-bit units: 0, 1, 2, 4, 8, 9, 10, 12, ... */
+        const svuint16_t vmask = svreinterpret_u16(
+            svindex_u64(0x0004000200010000, 0x0008000800080008));
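+        /* svindex_u64() counts in 64-bit steps, so each group of four 16-bit
+         * indices is {0, 1, 2, 4} plus a multiple of 8: it picks both halves of
+         * offBase plus the low halves of litLength and mlBase from each summed
+         * 16-byte structure, which is exactly the 8-byte SeqDef layout. */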
+        /* The upper bytes of `litLength` and `matchLength` will be packed into
+         * the middle of the overflow check vector. */
+        const svbool_t pmid = svcmpne_n_u8(
+            ptrue, svreinterpret_u8(svdup_n_u64(0x0000FFFFFFFF0000)), 0);
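+        /* The pattern enables bytes 2..5 of every 8-byte group and disables
+         * bytes 0..1 and 6..7, which hold the offset and rep parts after the
+         * uzp2/trn1 packing below. */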
+
+        do {
+            /* Load `lanes` consecutive `ZSTD_Sequence` structures into 8 vectors. */
+            const svuint32_t vin0 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 0);
+            const svuint32_t vin1 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 1);
+            const svuint32_t vin2 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 2);
+            const svuint32_t vin3 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 3);
+            const svuint32_t vin4 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 4);
+            const svuint32_t vin5 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 5);
+            const svuint32_t vin6 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 6);
+            const svuint32_t vin7 = svld1_vnum_u32(ptrue, &inSeqs[n].offset, 7);
+
+            /* Add {ZSTD_REP_NUM, 0, -MINMATCH, 0} to each structure. */
+            const svuint16x2_t vadd01 = svcreate2_u16(
+                svreinterpret_u16(svadd_u32_x(ptrue, vin0, vaddition)),
+                svreinterpret_u16(svadd_u32_x(ptrue, vin1, vaddition)));
+            const svuint16x2_t vadd23 = svcreate2_u16(
+                svreinterpret_u16(svadd_u32_x(ptrue, vin2, vaddition)),
+                svreinterpret_u16(svadd_u32_x(ptrue, vin3, vaddition)));
+            const svuint16x2_t vadd45 = svcreate2_u16(
+                svreinterpret_u16(svadd_u32_x(ptrue, vin4, vaddition)),
+                svreinterpret_u16(svadd_u32_x(ptrue, vin5, vaddition)));
+            const svuint16x2_t vadd67 = svcreate2_u16(
+                svreinterpret_u16(svadd_u32_x(ptrue, vin6, vaddition)),
+                svreinterpret_u16(svadd_u32_x(ptrue, vin7, vaddition)));
+
+            /* Shuffle and pack bytes so each vector contains SeqDef structures. */
+            const svuint16_t vout01 = svtbl2_u16(vadd01, vmask);
+            const svuint16_t vout23 = svtbl2_u16(vadd23, vmask);
+            const svuint16_t vout45 = svtbl2_u16(vadd45, vmask);
+            const svuint16_t vout67 = svtbl2_u16(vadd67, vmask);
+
+            /* Pack the upper 16 bits of the 32-bit lanes for the overflow check. */
+            const svuint16_t voverflow01 = svuzp2_u16(svget2_u16(vadd01, 0),
+                                                      svget2_u16(vadd01, 1));
+            const svuint16_t voverflow23 = svuzp2_u16(svget2_u16(vadd23, 0),
+                                                      svget2_u16(vadd23, 1));
+            const svuint16_t voverflow45 = svuzp2_u16(svget2_u16(vadd45, 0),
+                                                      svget2_u16(vadd45, 1));
+            const svuint16_t voverflow67 = svuzp2_u16(svget2_u16(vadd67, 0),
+                                                      svget2_u16(vadd67, 1));
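+            /* svuzp2 keeps the odd-indexed 16-bit elements, i.e. the upper
+             * halves of offBase, litLength, mlBase and rep of each sequence;
+             * a non-zero litLength or mlBase upper half means the value does
+             * not fit into a 16-bit SeqDef field. */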
+
+            /* We don't need the whole 16 bits of the overflow part: only 1 bit
+             * is needed, so we pack tightly and merge multiple vectors in order
+             * to handle the overflow case with a single comparison. However, we
+             * also need to handle possible negative values in the matchLength
+             * parts, so a signed comparison is used later. */
+            const svint8_t voverflow =
+                svmax_s8_x(pmid,
+                           svtrn1_s8(svreinterpret_s8(voverflow01),
+                                     svreinterpret_s8(voverflow23)),
+                           svtrn1_s8(svreinterpret_s8(voverflow45),
+                                     svreinterpret_s8(voverflow67)));
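+            /* TRN1 interleaves the even-indexed (low) bytes of two vectors, so
+             * the two TRN1 results plus one signed MAX fold the four overflow
+             * vectors into one; `pmid` then limits the test in cmpgtz_any_s8()
+             * to the litLength/matchLength byte positions. */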
+
+            /* Store `lanes` `SeqDef` structures from 4 vectors. */
+            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 0, svreinterpret_u32(vout01));
+            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 1, svreinterpret_u32(vout23));
+            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 2, svreinterpret_u32(vout45));
+            svst1_vnum_u32(ptrue, &dstSeqs[n].offBase, 3, svreinterpret_u32(vout67));
+
+            /* Check whether any enabled lane of the overflow vector is greater
+             * than zero; at most one such sequence can occur. */
+            if (UNLIKELY(cmpgtz_any_s8(pmid, voverflow))) {
+                /* A scalar search for the long sequence is needed because
+                 * multiple overflow bytes were merged with `max`. */
+                size_t i;
+                for (i = n; i < n + lanes; i++) {
+                    if (inSeqs[i].matchLength > 65535 + MINMATCH) {
+                        assert(longLen == 0);
+                        longLen = i + 1;
+                    }
+                    if (inSeqs[i].litLength > 65535) {
+                        assert(longLen == 0);
+                        longLen = i + nbSequences + 1;
+                    }
+                }
+            }
+
+            n += lanes;
+        } while (n <= nbSequences - lanes);
+    }
+
+    /* Handle remaining elements. */
+    for (; n < nbSequences; n++) {
+        dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset);
+        dstSeqs[n].litLength = (U16)inSeqs[n].litLength;
+        dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH);
+        /* Check for long length > 65535. */
+        if (UNLIKELY(inSeqs[n].matchLength > 65535 + MINMATCH)) {
+            assert(longLen == 0);
+            longLen = n + 1;
+        }
+        if (UNLIKELY(inSeqs[n].litLength > 65535)) {
+            assert(longLen == 0);
+            longLen = n + nbSequences + 1;
+        }
+    }
+    return longLen;
+}
+
 #elif defined(ZSTD_ARCH_ARM_NEON) && (defined(__aarch64__) || defined(_M_ARM64))
 
 size_t convertSequences_noRepcodes(