@@ -3730,6 +3730,8 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_can
 #  include <immintrin.h>
 # elif defined(__SSE2__)
 #  include <emmintrin.h>
+# elif defined(__riscv_vector)
+#  include <riscv_vector.h>
 # endif
 #endif
 
@@ -3852,6 +3854,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
  */
     XXH_VSX = 5,  /*!< VSX and ZVector for POWER8/z13 (64-bit) */
     XXH_SVE = 6,  /*!< SVE for some ARMv8-A and ARMv9-A */
+    XXH_RVV = 7,  /*!< RVV for RISC-V */
 };
 /*!
  * @ingroup tuning
@@ -3874,6 +3877,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #  define XXH_NEON 4
 #  define XXH_VSX 5
 #  define XXH_SVE 6
+#  define XXH_RVV 7
 #endif
 
 #ifndef XXH_VECTOR   /* can be defined on command line */
@@ -3898,6 +3902,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
      || (defined(__s390x__) && defined(__VEC__)) \
      && defined(__GNUC__) /* TODO: IBM XL */
 #    define XXH_VECTOR XXH_VSX
+#  elif defined(__riscv_vector)
+#    define XXH_VECTOR XXH_RVV
 #  else
 #    define XXH_VECTOR XXH_SCALAR
 #  endif
@@ -3935,6 +3941,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #    define XXH_ACC_ALIGN 64
 #  elif XXH_VECTOR == XXH_SVE   /* sve */
 #    define XXH_ACC_ALIGN 64
+#  elif XXH_VECTOR == XXH_RVV   /* rvv */
+#    define XXH_ACC_ALIGN 64    /* could be 8, but 64 may be faster */
 #  endif
 #endif
 
@@ -3943,6 +3951,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #  define XXH_SEC_ALIGN XXH_ACC_ALIGN
 #elif XXH_VECTOR == XXH_SVE
 #  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#elif XXH_VECTOR == XXH_RVV
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
 #else
 #  define XXH_SEC_ALIGN 8
 #endif
@@ -5601,6 +5611,132 @@ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
 
 #endif
 
+#if (XXH_VECTOR == XXH_RVV)
+#define XXH_CONCAT2(X, Y) X ## Y
+#define XXH_CONCAT(X, Y) XXH_CONCAT2(X, Y)
+#if ((defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 13) || \
+     (defined(__clang__) && __clang_major__ < 16))
+#define XXH_RVOP(op) op
+#define XXH_RVCAST(op) XXH_CONCAT(vreinterpret_v_, op)
+#else
+#define XXH_RVOP(op) XXH_CONCAT(__riscv_, op)
+#define XXH_RVCAST(op) XXH_CONCAT(__riscv_vreinterpret_v_, op)
+#endif
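+/* For example, XXH_RVOP(vsetvl_e64m2) expands to __riscv_vsetvl_e64m2 on
+ * current toolchains, and to the legacy unprefixed vsetvl_e64m2 on GCC < 13
+ * and clang < 16, which shipped the pre-ratification intrinsics naming. */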
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_rvv( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    {
+        /* Try to set the vector length to 512 bits (8 x 64-bit elements).
+         * If this length is unavailable, the maximum available length is used. */
+        size_t vl = XXH_RVOP(vsetvl_e64m2)(8);
+
+        uint64_t* xacc = (uint64_t*) acc;
+        const uint64_t* xinput = (const uint64_t*) input;
+        const uint64_t* xsecret = (const uint64_t*) secret;
+        static const uint64_t swap_mask[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+        vuint64m2_t xswap_mask = XXH_RVOP(vle64_v_u64m2)(swap_mask, vl);
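+        /* Gathering with these indices swaps each pair of adjacent 64-bit
+         * lanes, the RVV counterpart of the data_swap shuffle in the other
+         * SIMD paths; only the first vl indices are consumed. */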
+
+        size_t i;
+        for (i = 0; i < XXH_STRIPE_LEN/8; i += vl) {
+            /* data_vec = xinput[i]; */
+            vuint64m2_t data_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)((const uint8_t*)(xinput + i), vl * 8));
+            /* key_vec = xsecret[i]; */
+            vuint64m2_t key_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)((const uint8_t*)(xsecret + i), vl * 8));
+            /* acc_vec = xacc[i]; */
+            vuint64m2_t acc_vec = XXH_RVOP(vle64_v_u64m2)(xacc + i, vl);
+            /* data_key = data_vec ^ key_vec; */
+            vuint64m2_t data_key = XXH_RVOP(vxor_vv_u64m2)(data_vec, key_vec, vl);
+            /* data_key_hi = data_key >> 32; */
+            vuint64m2_t data_key_hi = XXH_RVOP(vsrl_vx_u64m2)(data_key, 32, vl);
+            /* data_key_lo = data_key & 0xffffffff; */
+            vuint64m2_t data_key_lo = XXH_RVOP(vand_vx_u64m2)(data_key, 0xffffffff, vl);
+            /* swap high and low halves */
+            vuint64m2_t data_swap = XXH_RVOP(vrgather_vv_u64m2)(data_vec, xswap_mask, vl);
+            /* acc_vec += data_key_lo * data_key_hi; */
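+            /* (both operands are below 2^32, so the low 64 bits of the
+             * product are exact: the vector form of scalar XXH_mult32to64) */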
+            acc_vec = XXH_RVOP(vmacc_vv_u64m2)(acc_vec, data_key_lo, data_key_hi, vl);
+            /* acc_vec += data_swap; */
+            acc_vec = XXH_RVOP(vadd_vv_u64m2)(acc_vec, data_swap, vl);
+            /* xacc[i] = acc_vec; */
+            XXH_RVOP(vse64_v_u64m2)(xacc + i, acc_vec, vl);
+        }
+    }
+}
+
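+/* XXH3_ACCUMULATE_TEMPLATE instantiates XXH3_accumulate_rvv(), which applies
+ * XXH3_accumulate_512_rvv() to each 64-byte stripe of the input block. */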
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(rvv)
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {
+        size_t count = XXH_STRIPE_LEN/8;
+        uint64_t* xacc = (uint64_t*)acc;
+        const uint8_t* xsecret = (const uint8_t*)secret;
+        size_t vl;
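+        /* strip-mined loop: vsetvl clamps vl to the remaining element count,
+         * so the final partial vector needs no scalar tail handling */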
+        for (; count > 0; count -= vl, xacc += vl, xsecret += vl*8) {
+            vl = XXH_RVOP(vsetvl_e64m2)(count);
+            {
+                /* key_vec = xsecret[i]; */
+                vuint64m2_t key_vec = XXH_RVCAST(u8m2_u64m2)(XXH_RVOP(vle8_v_u8m2)(xsecret, vl*8));
+                /* acc_vec = xacc[i]; */
+                vuint64m2_t acc_vec = XXH_RVOP(vle64_v_u64m2)(xacc, vl);
+                /* acc_vec ^= acc_vec >> 47; */
+                vuint64m2_t vsrl = XXH_RVOP(vsrl_vx_u64m2)(acc_vec, 47, vl);
+                acc_vec = XXH_RVOP(vxor_vv_u64m2)(acc_vec, vsrl, vl);
+                /* acc_vec ^= key_vec; */
+                acc_vec = XXH_RVOP(vxor_vv_u64m2)(acc_vec, key_vec, vl);
+                /* acc_vec *= XXH_PRIME32_1; */
+                acc_vec = XXH_RVOP(vmul_vx_u64m2)(acc_vec, XXH_PRIME32_1, vl);
+                /* xacc[i] = acc_vec; */
+                XXH_RVOP(vse64_v_u64m2)(xacc, acc_vec, vl);
+            }
+        }
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_rvv(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN >= 8);
+    XXH_ASSERT(((size_t)customSecret & 7) == 0);
+    (void)(&XXH_writeLE64);
+    {
+        size_t count = XXH_SECRET_DEFAULT_SIZE/8;
+        size_t vl;
+        size_t VLMAX = XXH_RVOP(vsetvlmax_e64m2)();
+        int64_t* cSecret = (int64_t*)customSecret;
+        const int64_t* kSecret = (const int64_t*)(const void*)XXH3_kSecret;
+
+#if __riscv_v_intrinsic >= 1000000
+        /* ratified v1.0 intrinsics version */
+        vbool32_t mneg = XXH_RVCAST(u8m1_b32)(
+            XXH_RVOP(vmv_v_x_u8m1)(0xaa, XXH_RVOP(vsetvlmax_e8m1)()));
+#else
+        /* support pre-ratification intrinsics, which lack mask-to-vector casts */
+        size_t vlmax = XXH_RVOP(vsetvlmax_e8m1)();
+        vbool32_t mneg = XXH_RVOP(vmseq_vx_u8mf4_b32)(
+            XXH_RVOP(vand_vx_u8mf4)(
+                XXH_RVOP(vid_v_u8mf4)(vlmax), 1, vlmax), 1, vlmax);
+#endif
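+        /* mneg marks every odd 64-bit lane (byte pattern 0xaa = bits 10101010);
+         * those lanes are negated below, so odd words get kSecret[i] - seed64
+         * and even words kSecret[i] + seed64, as in the scalar variant */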
+        vint64m2_t seed = XXH_RVOP(vmv_v_x_i64m2)((int64_t)seed64, VLMAX);
+        seed = XXH_RVOP(vneg_v_i64m2_mu)(mneg, seed, seed, VLMAX);
+
+        for (; count > 0; count -= vl, cSecret += vl, kSecret += vl) {
+            /* make sure vl=VLMAX until the last iteration */
+            vl = XXH_RVOP(vsetvl_e64m2)(count < VLMAX ? count : VLMAX);
+            {
+                vint64m2_t src = XXH_RVOP(vle64_v_i64m2)(kSecret, vl);
+                vint64m2_t res = XXH_RVOP(vadd_vv_i64m2)(src, seed, vl);
+                XXH_RVOP(vse64_v_i64m2)(cSecret, res, vl);
+            }
+        }
+    }
+}
+#endif
+
 /* scalar variants - universal */
 
 #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
@@ -5831,6 +5967,12 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
+#elif (XXH_VECTOR == XXH_RVV)
+#define XXH3_accumulate_512 XXH3_accumulate_512_rvv
+#define XXH3_accumulate XXH3_accumulate_rvv
+#define XXH3_scrambleAcc XXH3_scrambleAcc_rvv
+#define XXH3_initCustomSecret XXH3_initCustomSecret_rvv
+
 #else /* scalar */
 
 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
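A quick compile-time probe can confirm which dispatch path this patch selects. A minimal sketch, assuming XXH_INLINE_ALL (so XXH_VECTOR and XXH_RVV are visible to the includer) and a V-enabled cross toolchain; the file name and -march string are illustrative:

    /* rvv_probe.c -- prints the vector path chosen at compile time */
    #define XXH_INLINE_ALL
    #include "xxhash.h"
    #include <stdio.h>

    int main(void)
    {
    #if XXH_VECTOR == XXH_RVV
        puts("XXH_VECTOR == XXH_RVV");
    #else
        printf("XXH_VECTOR == %d (not RVV)\n", XXH_VECTOR);
    #endif
        return 0;
    }

    $ riscv64-linux-gnu-gcc -march=rv64gcv -O2 rvv_probe.c -o rvv_probe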