 
 #include "field_5x52_int128_impl.h"
 
+#ifdef __AVX2__
+# include <immintrin.h>
+#endif
+
 #ifdef VERIFY
 static void secp256k1_fe_impl_verify(const secp256k1_fe *a) {
     const uint64_t *d = a->n;
@@ -37,10 +41,15 @@ static void secp256k1_fe_impl_get_bounds(secp256k1_fe *r, int m) {
     const uint64_t bound1 = 0xFFFFFFFFFFFFFULL * two_m;
     const uint64_t bound2 = 0x0FFFFFFFFFFFFULL * two_m;
 
+#ifdef __AVX2__
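+    /* Broadcast bound1 into limbs 0..3 with a single 256-bit store; limb 4 is set separately below. */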
+    __m256i vec = _mm256_set1_epi64x(bound1);
+    _mm256_storeu_si256((__m256i *)r->n, vec);
+#else
     r->n[0] = bound1;
     r->n[1] = bound1;
     r->n[2] = bound1;
     r->n[3] = bound1;
+#endif
     r->n[4] = bound2;
 }
 
@@ -209,6 +218,7 @@ SECP256K1_INLINE static void secp256k1_fe_impl_set_int(secp256k1_fe *r, int a) {
 
 SECP256K1_INLINE static int secp256k1_fe_impl_is_zero(const secp256k1_fe *a) {
     const uint64_t *t = a->n;
+    /* TODO: parallelize, e.g. with _mm256_testz_si256. */
     return (t[0] | t[1] | t[2] | t[3] | t[4]) == 0;
 }
 
@@ -239,6 +249,8 @@ static void secp256k1_fe_impl_set_b32_mod(secp256k1_fe *r, const unsigned char *
     limbs[3] = BYTESWAP_64(limbs[3]);
 #endif
 
+    /* TODO: parallelize with AVX2. */
+
     r->n[0] = (limbs[3] & 0xFFFFFFFFFFFFFULL);
     r->n[1] = (limbs[3] >> 52) | ((limbs[2] & 0xFFFFFFFFFFULL) << 12);
     r->n[2] = (limbs[2] >> 40) | ((limbs[1] & 0xFFFFFFFULL) << 24);
@@ -291,6 +303,10 @@ static void secp256k1_fe_impl_get_b32(unsigned char *r, const secp256k1_fe *a) {
 }
 
 SECP256K1_INLINE static void secp256k1_fe_impl_negate_unchecked(secp256k1_fe *r, const secp256k1_fe *a, int m) {
+#ifdef __AVX2__
+    /* Load early to hide the load latency. */
+    __m256i vec_a = _mm256_loadu_si256((const __m256i *)a->n);
+#endif
     const uint32_t two_m1 = 2 * (m + 1);
     const uint64_t bound1 = 0xFFFFEFFFFFC2FULL * two_m1;
     const uint64_t bound2 = 0xFFFFFFFFFFFFFULL * two_m1;
@@ -303,10 +319,18 @@ SECP256K1_INLINE static void secp256k1_fe_impl_negate_unchecked(secp256k1_fe *r,
 
     /* Due to the properties above, the left hand in the subtractions below is never less than
      * the right hand. */
+#ifdef __AVX2__
+    {
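+        /* Lane 0 subtracts from bound1; lanes 1..3 subtract from bound2. Limb 4 stays scalar below. */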
+        __m256i vec_bounds = _mm256_setr_epi64x(bound1, bound2, bound2, bound2);
+        __m256i out = _mm256_sub_epi64(vec_bounds, vec_a);
+        _mm256_storeu_si256((__m256i *)r->n, out);
+    }
+#else
     r->n[0] = bound1 - a->n[0];
     r->n[1] = bound2 - a->n[1];
     r->n[2] = bound2 - a->n[2];
     r->n[3] = bound2 - a->n[3];
+#endif
     r->n[4] = bound3 - a->n[4];
 }
 
@@ -339,15 +363,32 @@ SECP256K1_INLINE static void secp256k1_fe_impl_sqr(secp256k1_fe *r, const secp25
 }
 
 SECP256K1_INLINE static void secp256k1_fe_impl_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag) {
+#ifdef __AVX2__
+    /* Load early to hide the load latency. */
+    __m256i vec_r = _mm256_loadu_si256((__m256i *)r->n);
+    __m256i vec_a = _mm256_loadu_si256((const __m256i *)a->n);
+#endif
+
     uint64_t mask0, mask1;
     volatile int vflag = flag;
     SECP256K1_CHECKMEM_CHECK_VERIFY(r->n, sizeof(r->n));
     mask0 = vflag + ~((uint64_t)0);
     mask1 = ~mask0;
+
+#ifdef __AVX2__
+    {
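+        /* Vectorized select over limbs 0..3: (r & mask0) | (a & mask1). Limb 4 follows in scalar code. */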
+        __m256i vec_mask0 = _mm256_set1_epi64x(mask0);
+        __m256i vec_mask1 = _mm256_set1_epi64x(mask1);
+        vec_r = _mm256_and_si256(vec_r, vec_mask0);
+        vec_a = _mm256_and_si256(vec_a, vec_mask1);
+        _mm256_storeu_si256((__m256i *)r->n, _mm256_or_si256(vec_r, vec_a));
+    }
+#else
     r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1);
     r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1);
     r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1);
     r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1);
+#endif
     r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1);
 }
 
@@ -418,19 +459,42 @@ static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r,
 }
 
 static void secp256k1_fe_impl_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) {
+#ifdef __AVX2__
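+    /* Repack five 52-bit limbs into four 64-bit words: word i = (n[i] >> shift_lhs[i]) | (n[i+1] << shift_rhs[i]).
+     * The two overlapping unaligned loads provide limbs 0..3 and 1..4. */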
+    __m256i limbs_0123 = _mm256_loadu_si256((const __m256i *)a->n);
+    __m256i limbs_1234 = _mm256_loadu_si256((const __m256i *)(a->n + 1));
+    const __m256i shift_lhs = _mm256_setr_epi64x(0, 12, 24, 36); /* TODO: precompute */
+    const __m256i shift_rhs = _mm256_setr_epi64x(52, 40, 28, 16); /* TODO: precompute */
+    __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs);
+    __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs);
+    _mm256_storeu_si256((__m256i *)r->n, _mm256_or_si256(lhs, rhs));
+#else
     r->n[0] = a->n[0] | a->n[1] << 52;
     r->n[1] = a->n[1] >> 12 | a->n[2] << 40;
     r->n[2] = a->n[2] >> 24 | a->n[3] << 28;
     r->n[3] = a->n[3] >> 36 | a->n[4] << 16;
+#endif
 }
 
 static SECP256K1_INLINE void secp256k1_fe_impl_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) {
     const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3];
 
+#ifdef __AVX2__
+    {
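+        /* Unpack four 64-bit words into 52-bit limbs. Unlike scalar shifts, a variable shift count
+         * of 64 yields 0 here, so lane 0 takes only the masked low bits of a0. */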
+        __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
+        __m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2);
+        const __m256i shift_lhs = _mm256_setr_epi64x(64, 52, 40, 28); /* TODO: precompute */
+        const __m256i shift_rhs = _mm256_setr_epi64x(0, 12, 24, 36); /* TODO: precompute */
+        const __m256i mask52 = _mm256_set1_epi64x(0xFFFFFFFFFFFFFULL); /* TODO: precompute */
+        __m256i rhs = _mm256_and_si256(_mm256_sllv_epi64(limbs_0123, shift_rhs), mask52);
+        __m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs);
+        _mm256_storeu_si256((__m256i *)r->n, _mm256_or_si256(lhs, rhs));
+    }
+#else
     r->n[0] = a0 & 0xFFFFFFFFFFFFFULL;
     r->n[1] = a0 >> 52 | ((a1 << 12) & 0xFFFFFFFFFFFFFULL);
     r->n[2] = a1 >> 40 | ((a2 << 24) & 0xFFFFFFFFFFFFFULL);
     r->n[3] = a2 >> 28 | ((a3 << 36) & 0xFFFFFFFFFFFFFULL);
+#endif
     r->n[4] = a3 >> 16;
 }
 
@@ -447,21 +511,49 @@ static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64
     VERIFY_CHECK(a3 >> 62 == 0);
     VERIFY_CHECK(a4 >> 8 == 0);
 
+#ifdef __AVX2__
+    {
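+        /* Convert 62-bit limbs to 52-bit form; the lhs shift of 64 zeroes lane 0, and a single
+         * final mask reduces all four lanes to 52 bits. */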
+        __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
+        __m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2);
+        const __m256i shift_lhs = _mm256_setr_epi64x(64, 52, 42, 32); /* TODO: precompute */
+        const __m256i shift_rhs = _mm256_setr_epi64x(0, 10, 20, 30); /* TODO: precompute */
+        const __m256i mask52 = _mm256_set1_epi64x(M52); /* TODO: precompute */
+        __m256i rhs = _mm256_sllv_epi64(limbs_0123, shift_rhs);
+        __m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs);
+        __m256i out = _mm256_or_si256(lhs, rhs);
+        _mm256_storeu_si256((__m256i *)r->n, _mm256_and_si256(out, mask52));
+    }
+#else
     r->n[0] = a0 & M52;
     r->n[1] = (a0 >> 52 | a1 << 10) & M52;
     r->n[2] = (a1 >> 42 | a2 << 20) & M52;
     r->n[3] = (a2 >> 32 | a3 << 30) & M52;
+#endif
     r->n[4] = (a3 >> 22 | a4 << 40);
 }
 
 static void secp256k1_fe_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_fe *a) {
     const uint64_t M62 = UINT64_MAX >> 2;
     const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4];
 
+#ifdef __AVX2__
+    {
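+        /* Repack 52-bit limbs into 62-bit limbs: v[i] = (n[i] >> shift_lhs[i] | n[i+1] << shift_rhs[i]) & M62. */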
+        __m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
+        __m256i limbs_1234 = _mm256_setr_epi64x(a1, a2, a3, a4);
+        const __m256i shift_lhs = _mm256_setr_epi64x(0, 10, 20, 30); /* TODO: precompute */
+        const __m256i shift_rhs = _mm256_setr_epi64x(52, 42, 32, 22); /* TODO: precompute */
+        const __m256i mask62 = _mm256_set1_epi64x(M62); /* TODO: precompute */
+        __m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs);
+        __m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs);
+        __m256i out = _mm256_or_si256(lhs, rhs);
+        _mm256_storeu_si256((__m256i *)r->v, _mm256_and_si256(out, mask62));
+    }
+#else
     r->v[0] = (a0 | a1 << 52) & M62;
     r->v[1] = (a1 >> 10 | a2 << 42) & M62;
     r->v[2] = (a2 >> 20 | a3 << 32) & M62;
     r->v[3] = (a3 >> 30 | a4 << 22) & M62;
+#endif
     r->v[4] = a4 >> 40;
 }
 