Skip to content

Commit d4bde68

Browse files
committed
Add Intel SIMD (AVX2)
1 parent: 497af2e — commit d4bde68

File tree

4 files changed

+195
-13
lines changed

4 files changed

+195
-13
lines changed

src/field_5x52_impl.h

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414

1515
#include "field_5x52_int128_impl.h"
1616

17+
#ifdef X86
18+
# include <immintrin.h>
19+
#endif
20+
1721
#ifdef VERIFY
1822
static void secp256k1_fe_impl_verify(const secp256k1_fe *a) {
1923
const uint64_t *d = a->n;
@@ -37,10 +41,15 @@ static void secp256k1_fe_impl_get_bounds(secp256k1_fe *r, int m) {
3741
const uint64_t bound1 = 0xFFFFFFFFFFFFFULL * two_m;
3842
const uint64_t bound2 = 0x0FFFFFFFFFFFFULL * two_m;
3943

44+
#ifdef __AVX2__
45+
__m256i vec = _mm256_set1_epi64x(bound1);
46+
_mm256_storeu_si256((__m256i *)r->n, vec);
47+
#else
4048
r->n[0] = bound1;
4149
r->n[1] = bound1;
4250
r->n[2] = bound1;
4351
r->n[3] = bound1;
52+
#endif
4453
r->n[4] = bound2;
4554
}
4655

@@ -209,6 +218,7 @@ SECP256K1_INLINE static void secp256k1_fe_impl_set_int(secp256k1_fe *r, int a) {
209218

210219
SECP256K1_INLINE static int secp256k1_fe_impl_is_zero(const secp256k1_fe *a) {
211220
const uint64_t *t = a->n;
221+
/* TODO: parallelize. mm_testz */
212222
return (t[0] | t[1] | t[2] | t[3] | t[4]) == 0;
213223
}
214224

@@ -239,6 +249,8 @@ static void secp256k1_fe_impl_set_b32_mod(secp256k1_fe *r, const unsigned char *
239249
limbs[3] = BYTESWAP_64(limbs[3]);
240250
#endif
241251

252+
/* TODO: parallelize avx2 */
253+
242254
r->n[0] = (limbs[3] & 0xFFFFFFFFFFFFFULL);
243255
r->n[1] = (limbs[3] >> 52) | ((limbs[2] & 0xFFFFFFFFFFULL) << 12);
244256
r->n[2] = (limbs[2] >> 40) | ((limbs[1] & 0xFFFFFFFULL) << 24);
@@ -291,6 +303,10 @@ static void secp256k1_fe_impl_get_b32(unsigned char *r, const secp256k1_fe *a) {
291303
}
292304

293305
SECP256K1_INLINE static void secp256k1_fe_impl_negate_unchecked(secp256k1_fe *r, const secp256k1_fe *a, int m) {
306+
#ifdef __AVX2__
307+
/* load here to mitigate load latency */
308+
__m256i vec_a = _mm256_loadu_si256((__m256i *)a->n);
309+
#endif
294310
const uint32_t two_m1 = 2 * (m + 1);
295311
const uint64_t bound1 = 0xFFFFEFFFFFC2FULL * two_m1;
296312
const uint64_t bound2 = 0xFFFFFFFFFFFFFULL * two_m1;
@@ -303,10 +319,18 @@ SECP256K1_INLINE static void secp256k1_fe_impl_negate_unchecked(secp256k1_fe *r,
303319

304320
/* Due to the properties above, the left hand in the subtractions below is never less than
305321
* the right hand. */
322+
#ifdef __AVX2__
323+
{
324+
__m256i vec_bounds = _mm256_setr_epi64x(bound1, bound2, bound2, bound2);
325+
__m256i out = _mm256_sub_epi64(vec_bounds, vec_a);
326+
_mm256_storeu_si256((__m256i *)r->n, out);
327+
}
328+
#else
306329
r->n[0] = bound1 - a->n[0];
307330
r->n[1] = bound2 - a->n[1];
308331
r->n[2] = bound2 - a->n[2];
309332
r->n[3] = bound2 - a->n[3];
333+
#endif
310334
r->n[4] = bound3 - a->n[4];
311335
}
312336

@@ -339,15 +363,32 @@ SECP256K1_INLINE static void secp256k1_fe_impl_sqr(secp256k1_fe *r, const secp25
339363
}
340364

341365
SECP256K1_INLINE static void secp256k1_fe_impl_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag) {
366+
#ifdef __AVX2__
367+
/* load here to mitigate load latency */
368+
__m256i vec_r = _mm256_loadu_si256((__m256i *)(r->n));
369+
__m256i vec_a = _mm256_loadu_si256((__m256i *)(a->n));
370+
#endif
371+
342372
uint64_t mask0, mask1;
343373
volatile int vflag = flag;
344374
SECP256K1_CHECKMEM_CHECK_VERIFY(r->n, sizeof(r->n));
345375
mask0 = vflag + ~((uint64_t)0);
346376
mask1 = ~mask0;
377+
378+
#ifdef __AVX2__
379+
{
380+
__m256i vec_mask0 = _mm256_set1_epi64x(mask0);
381+
__m256i vec_mask1 = _mm256_set1_epi64x(mask1);
382+
vec_r = _mm256_and_si256(vec_r, vec_mask0);
383+
vec_a = _mm256_and_si256(vec_a, vec_mask1);
384+
_mm256_storeu_si256((__m256i *)r->n, _mm256_or_si256(vec_r, vec_a));
385+
}
386+
#else
347387
r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1);
348388
r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1);
349389
r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1);
350390
r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1);
391+
#endif
351392
r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1);
352393
}
353394

@@ -418,19 +459,42 @@ static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r,
418459
}
419460

420461
static void secp256k1_fe_impl_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) {
462+
#ifdef __AVX2__
463+
__m256i limbs_0123 = _mm256_loadu_si256((__m256i *)a->n);
464+
__m256i limbs_1234 = _mm256_loadu_si256((__m256i *)(a->n + 1));
465+
const __m256i shift_lhs = _mm256_setr_epi64x(0, 12, 24, 36); /* TODO: precompute */
466+
const __m256i shift_rhs = _mm256_setr_epi64x(52, 40, 28, 16); /* TODO: precompute */
467+
__m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs);
468+
__m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs);
469+
_mm256_storeu_si256((__m256i *)r->n, _mm256_or_si256(lhs, rhs));
470+
#else
421471
r->n[0] = a->n[0] | a->n[1] << 52;
422472
r->n[1] = a->n[1] >> 12 | a->n[2] << 40;
423473
r->n[2] = a->n[2] >> 24 | a->n[3] << 28;
424474
r->n[3] = a->n[3] >> 36 | a->n[4] << 16;
475+
#endif
425476
}
426477

427478
static SECP256K1_INLINE void secp256k1_fe_impl_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) {
428479
const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3];
429480

481+
#ifdef __AVX2__
482+
{
483+
__m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
484+
__m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2);
485+
const __m256i shift_lhs = _mm256_setr_epi64x(64, 52, 40, 28); /* TODO: precompute */
486+
const __m256i shift_rhs = _mm256_setr_epi64x(0, 12, 24, 36); /* TODO: precompute */
487+
const __m256i mask52 = _mm256_set1_epi64x(0xFFFFFFFFFFFFFULL); /* TODO: precompute */
488+
__m256i rhs = _mm256_and_si256(_mm256_sllv_epi64(limbs_0123, shift_rhs), mask52);
489+
__m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs);
490+
_mm256_storeu_si256((__m256i*)r->n, _mm256_or_si256(lhs, rhs));
491+
}
492+
#else
430493
r->n[0] = a0 & 0xFFFFFFFFFFFFFULL;
431494
r->n[1] = a0 >> 52 | ((a1 << 12) & 0xFFFFFFFFFFFFFULL);
432495
r->n[2] = a1 >> 40 | ((a2 << 24) & 0xFFFFFFFFFFFFFULL);
433496
r->n[3] = a2 >> 28 | ((a3 << 36) & 0xFFFFFFFFFFFFFULL);
497+
#endif
434498
r->n[4] = a3 >> 16;
435499
}
436500

@@ -447,21 +511,49 @@ static void secp256k1_fe_from_signed62(secp256k1_fe *r, const secp256k1_modinv64
447511
VERIFY_CHECK(a3 >> 62 == 0);
448512
VERIFY_CHECK(a4 >> 8 == 0);
449513

514+
#ifdef __AVX2__
515+
{
516+
__m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
517+
__m256i limbs_0012 = _mm256_setr_epi64x(a0, a0, a1, a2);
518+
const __m256i shift_lhs = _mm256_setr_epi64x(64, 52, 42, 32); /*TODO: precompute */
519+
const __m256i shift_rhs = _mm256_setr_epi64x(0, 10, 20, 30); /*TODO: precompute */
520+
const __m256i mask52 = _mm256_set1_epi64x(M52); /*TODO: precompute */
521+
__m256i rhs = _mm256_sllv_epi64(limbs_0123, shift_rhs);
522+
__m256i lhs = _mm256_srlv_epi64(limbs_0012, shift_lhs);
523+
__m256i out = _mm256_or_si256(lhs, rhs);
524+
_mm256_storeu_si256((__m256i*)r->n, _mm256_and_si256(out, mask52));
525+
}
526+
#else
450527
r->n[0] = a0 & M52;
451528
r->n[1] = (a0 >> 52 | a1 << 10) & M52;
452529
r->n[2] = (a1 >> 42 | a2 << 20) & M52;
453530
r->n[3] = (a2 >> 32 | a3 << 30) & M52;
531+
#endif
454532
r->n[4] = (a3 >> 22 | a4 << 40);
455533
}
456534

457535
static void secp256k1_fe_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_fe *a) {
458536
const uint64_t M62 = UINT64_MAX >> 2;
459537
const uint64_t a0 = a->n[0], a1 = a->n[1], a2 = a->n[2], a3 = a->n[3], a4 = a->n[4];
460538

539+
#ifdef __AVX2__
540+
{
541+
__m256i limbs_0123 = _mm256_setr_epi64x(a0, a1, a2, a3);
542+
__m256i limbs_1234 = _mm256_setr_epi64x(a1, a2, a3, a4);
543+
const __m256i shift_lhs = _mm256_setr_epi64x(0, 10, 20, 30); /*TODO: precompute */
544+
const __m256i shift_rhs = _mm256_setr_epi64x(52, 42, 32, 22); /*TODO: precompute */
545+
const __m256i mask62 = _mm256_set1_epi64x(M62); /*TODO: precompute */
546+
__m256i lhs = _mm256_srlv_epi64(limbs_0123, shift_lhs);
547+
__m256i rhs = _mm256_sllv_epi64(limbs_1234, shift_rhs);
548+
__m256i out = _mm256_or_si256(lhs, rhs);
549+
_mm256_storeu_si256((__m256i *)r->v, _mm256_and_si256(out, mask62));
550+
}
551+
#else
461552
r->v[0] = (a0 | a1 << 52) & M62;
462553
r->v[1] = (a1 >> 10 | a2 << 42) & M62;
463554
r->v[2] = (a2 >> 20 | a3 << 32) & M62;
464555
r->v[3] = (a3 >> 30 | a4 << 22) & M62;
556+
#endif
465557
r->v[4] = a4 >> 40;
466558
}
467559

src/hash_impl.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
#include <stdint.h>
1515
#include <string.h>
1616

17+
#ifdef X86
18+
# include <immintrin.h>
19+
#endif
20+
1721
#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
1822
#define Maj(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
1923
#define Sigma0(x) (((x) >> 2 | (x) << 30) ^ ((x) >> 13 | (x) << 19) ^ ((x) >> 22 | (x) << 10))
@@ -29,6 +33,13 @@
2933
} while(0)
3034

3135
static void secp256k1_sha256_initialize(secp256k1_sha256 *hash) {
36+
#if defined(__AVX2__)
37+
const __m256i vec = _mm256_setr_epi32( /* TODO: precompute */
38+
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
39+
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
40+
);
41+
_mm256_storeu_si256((__m256i *)hash->s, vec);
42+
#else
3243
hash->s[0] = 0x6a09e667ul;
3344
hash->s[1] = 0xbb67ae85ul;
3445
hash->s[2] = 0x3c6ef372ul;
@@ -37,6 +48,7 @@ static void secp256k1_sha256_initialize(secp256k1_sha256 *hash) {
3748
hash->s[5] = 0x9b05688cul;
3849
hash->s[6] = 0x1f83d9abul;
3950
hash->s[7] = 0x5be0cd19ul;
51+
#endif
4052
hash->bytes = 0;
4153
}
4254

@@ -45,6 +57,8 @@ static void secp256k1_sha256_transform(uint32_t* s, const unsigned char* buf) {
4557
uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7];
4658
uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
4759

60+
/* TODO: load wX in parallel */
61+
4862
Round(a, b, c, d, e, f, g, h, 0x428a2f98, w0 = secp256k1_read_be32(&buf[0]));
4963
Round(h, a, b, c, d, e, f, g, 0x71374491, w1 = secp256k1_read_be32(&buf[4]));
5064
Round(g, h, a, b, c, d, e, f, 0xb5c0fbcf, w2 = secp256k1_read_be32(&buf[8]));
@@ -62,6 +76,8 @@ static void secp256k1_sha256_transform(uint32_t* s, const unsigned char* buf) {
6276
Round(c, d, e, f, g, h, a, b, 0x9bdc06a7, w14 = secp256k1_read_be32(&buf[56]));
6377
Round(b, c, d, e, f, g, h, a, 0xc19bf174, w15 = secp256k1_read_be32(&buf[60]));
6478

79+
/* TODO: sum wX in parallel */
80+
6581
Round(a, b, c, d, e, f, g, h, 0xe49b69c1, w0 += sigma1(w14) + w9 + sigma0(w1));
6682
Round(h, a, b, c, d, e, f, g, 0xefbe4786, w1 += sigma1(w15) + w10 + sigma0(w2));
6783
Round(g, h, a, b, c, d, e, f, 0x0fc19dc6, w2 += sigma1(w0) + w11 + sigma0(w3));
@@ -79,6 +95,8 @@ static void secp256k1_sha256_transform(uint32_t* s, const unsigned char* buf) {
7995
Round(c, d, e, f, g, h, a, b, 0x06ca6351, w14 += sigma1(w12) + w7 + sigma0(w15));
8096
Round(b, c, d, e, f, g, h, a, 0x14292967, w15 += sigma1(w13) + w8 + sigma0(w0));
8197

98+
/* TODO: sum wX in parallel */
99+
82100
Round(a, b, c, d, e, f, g, h, 0x27b70a85, w0 += sigma1(w14) + w9 + sigma0(w1));
83101
Round(h, a, b, c, d, e, f, g, 0x2e1b2138, w1 += sigma1(w15) + w10 + sigma0(w2));
84102
Round(g, h, a, b, c, d, e, f, 0x4d2c6dfc, w2 += sigma1(w0) + w11 + sigma0(w3));
@@ -96,6 +114,8 @@ static void secp256k1_sha256_transform(uint32_t* s, const unsigned char* buf) {
96114
Round(c, d, e, f, g, h, a, b, 0xf40e3585, w14 += sigma1(w12) + w7 + sigma0(w15));
97115
Round(b, c, d, e, f, g, h, a, 0x106aa070, w15 += sigma1(w13) + w8 + sigma0(w0));
98116

117+
/* TODO: sum wX in parallel */
118+
99119
Round(a, b, c, d, e, f, g, h, 0x19a4c116, w0 += sigma1(w14) + w9 + sigma0(w1));
100120
Round(h, a, b, c, d, e, f, g, 0x1e376c08, w1 += sigma1(w15) + w10 + sigma0(w2));
101121
Round(g, h, a, b, c, d, e, f, 0x2748774c, w2 += sigma1(w0) + w11 + sigma0(w3));

0 commit comments

Comments
 (0)