From 93f3146f671efef734ef05ec06ffd48d961148ee Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Mon, 8 Dec 2025 18:16:22 +0800 Subject: [PATCH 1/2] Benchmarks: Add component benchmarks for native functions - poly_pointwise_montgomery - poly_caddq - poly_chknorm - poly_decompose - poly_use_hint - polyz_unpack Signed-off-by: Matthias J. Kannwischer --- test/bench_components_mldsa.c | 62 ++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/test/bench_components_mldsa.c b/test/bench_components_mldsa.c index b6dc9e8e7..e10487a77 100644 --- a/test/bench_components_mldsa.c +++ b/test/bench_components_mldsa.c @@ -10,6 +10,7 @@ #include #include "../mldsa/src/ntt.h" #include "../mldsa/src/poly.h" +#include "../mldsa/src/poly_kl.h" #include "../mldsa/src/polyvec.h" #include "../mldsa/src/randombytes.h" #include "hal.h" @@ -23,36 +24,40 @@ static int cmp_uint64_t(const void *a, const void *b) return (int)((*((const uint64_t *)a)) - (*((const uint64_t *)b))); } -#define BENCH(txt, code) \ - for (i = 0; i < NTESTS; i++) \ - { \ - mld_randombytes((uint8_t *)data0, sizeof(data0)); \ - mld_randombytes((uint8_t *)&polyvecl_a, sizeof(polyvecl_a)); \ - mld_randombytes((uint8_t *)&polyvecl_b, sizeof(polyvecl_b)); \ - mld_randombytes((uint8_t *)&polymat, sizeof(polymat)); \ - for (j = 0; j < NWARMUP; j++) \ - { \ - code; \ - } \ - \ - t0 = get_cyclecounter(); \ - for (j = 0; j < NITERATIONS; j++) \ - { \ - code; \ - } \ - t1 = get_cyclecounter(); \ - (cyc)[i] = t1 - t0; \ - } \ - qsort((cyc), NTESTS, sizeof(uint64_t), cmp_uint64_t); \ +#define BENCH(txt, code) \ + for (i = 0; i < NTESTS; i++) \ + { \ + mld_randombytes((uint8_t *)data0, sizeof(data0)); \ + mld_randombytes((uint8_t *)&poly_out, sizeof(poly_out)); \ + mld_randombytes((uint8_t *)&poly_hint, sizeof(poly_hint)); \ + mld_randombytes((uint8_t *)&polyvecl_a, sizeof(polyvecl_a)); \ + mld_randombytes((uint8_t *)&polyvecl_b, sizeof(polyvecl_b)); \ + mld_randombytes((uint8_t *)&polymat, sizeof(polymat)); \ + mld_randombytes((uint8_t *)polyz_packed, sizeof(polyz_packed)); \ + for (j = 0; j < NWARMUP; j++) \ + { \ + code; \ + } \ + \ + t0 = get_cyclecounter(); \ + for (j = 0; j < NITERATIONS; j++) \ + { \ + code; \ + } \ + t1 = get_cyclecounter(); \ + (cyc)[i] = t1 - t0; \ + } \ + qsort((cyc), NTESTS, sizeof(uint64_t), cmp_uint64_t); \ printf(txt " cycles=%" PRIu64 "\n", (cyc)[NTESTS >> 1] / NITERATIONS); static int bench(void) { MLD_ALIGN int32_t data0[256]; - MLD_ALIGN mld_poly poly_out; + MLD_ALIGN mld_poly poly_out, poly_a1, poly_a0, poly_hint; MLD_ALIGN mld_polyvecl polyvecl_a, polyvecl_b; MLD_ALIGN mld_polyveck polyveck_out; MLD_ALIGN mld_polymat polymat; + MLD_ALIGN uint8_t polyz_packed[MLDSA_POLYZ_PACKEDBYTES]; uint64_t cyc[NTESTS]; unsigned i, j; uint64_t t0, t1; @@ -62,6 +67,9 @@ static int bench(void) BENCH("poly_invntt_tomont", mld_poly_invntt_tomont((mld_poly *)data0)) /* pointwise */ + BENCH("poly_pointwise_montgomery", + mld_poly_pointwise_montgomery(&poly_out, &polyvecl_a.vec[0], + &polyvecl_b.vec[0])) BENCH("polyvecl_pointwise_acc_montgomery", mld_polyvecl_pointwise_acc_montgomery(&poly_out, &polyvecl_a, &polyvecl_b)) @@ -69,6 +77,16 @@ static int bench(void) mld_polyvec_matrix_pointwise_montgomery(&polyveck_out, &polymat, &polyvecl_b)) + /* poly arithmetic */ + BENCH("poly_caddq", mld_poly_caddq(&poly_out)) + BENCH("poly_chknorm", mld_poly_chknorm(&poly_out, MLDSA_GAMMA1 - 1)) + BENCH("poly_decompose", mld_poly_decompose(&poly_a1, &poly_a0, &poly_out)) + BENCH("poly_use_hint", + 
mld_poly_use_hint(&poly_out, &polyvecl_a.vec[0], &poly_hint)) + + /* packing */ + BENCH("polyz_unpack", mld_polyz_unpack(&poly_out, polyz_packed)) + return 0; } From b5e75c640249e445d5539c702dae95f19574aaa1 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Mon, 8 Dec 2025 18:25:07 +0800 Subject: [PATCH 2/2] SLOTHY: Superoptimize `poly_pointwise_montgomery` Resolves https://github.com/pq-code-package/mldsa-native/issues/758 Signed-off-by: Matthias J. Kannwischer --- dev/aarch64_opt/src/Makefile | 2 +- dev/aarch64_opt/src/pointwise_montgomery.S | 335 ++++++++++++++++-- .../native/aarch64/src/pointwise_montgomery.S | 163 ++++++--- 3 files changed, 422 insertions(+), 78 deletions(-) diff --git a/dev/aarch64_opt/src/Makefile b/dev/aarch64_opt/src/Makefile index 4f280d4b2..2ff8e74a6 100644 --- a/dev/aarch64_opt/src/Makefile +++ b/dev/aarch64_opt/src/Makefile @@ -86,7 +86,7 @@ mld_polyvecl_pointwise_acc_montgomery_l7.S: ../../aarch64_clean/src/mld_polyvecl cp $< $@ pointwise_montgomery.S: ../../aarch64_clean/src/pointwise_montgomery.S - cp $< $@ + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l loop_start $(SLOTHY_FLAGS) $(RESERVE_ALL_FLAG) poly_caddq_asm.S: ../../aarch64_clean/src/poly_caddq_asm.S cp $< $@ diff --git a/dev/aarch64_opt/src/pointwise_montgomery.S b/dev/aarch64_opt/src/pointwise_montgomery.S index fd7aac729..43a9d600a 100644 --- a/dev/aarch64_opt/src/pointwise_montgomery.S +++ b/dev/aarch64_opt/src/pointwise_montgomery.S @@ -1,6 +1,6 @@ /* Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ #include "../../../common.h" #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) @@ -85,41 +85,304 @@ MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_asm) mov count, #(MLDSA_N / 4) + // Instructions: 34 + // Expected cycles: 24 + // Expected IPC: 1.42 + // + // Cycle bound: 24.0 + // IPC bound: 1.42 + // + // Wall time: 0.14s + // User time: 0.14s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q3, [x2, #48] // *............................. + ldr q28, [x1, #48] // *............................. + ldr q5, [x1], #4*16 // .*............................ + ldr q23, [x2, #16] // .*............................ + ldr q2, [x2], #4*16 // ..*........................... + ldr q19, [x1, #-48] // ..*........................... + ldr q31, [x1, #-32] // ...*.......................... + ldr q7, [x2, #-32] // ...*.......................... + smull v26.2D, v28.2S, v3.2S // ....*......................... + ldr q25, [x2, #32] // ....*......................... + smull2 v24.2D, v28.4S, v3.4S // .....*........................ + ldr q28, [x2], #4*16 // .....*........................ + smull v29.2D, v5.2S, v2.2S // ......*....................... + ldr q6, [x1, #48] // ......*....................... + smull2 v5.2D, v5.4S, v2.4S // .......*...................... + ldr q22, [x1, #16] // .......*...................... + smull v16.2D, v19.2S, v23.2S // ........*..................... + smull2 v18.2D, v19.4S, v23.4S // .........*.................... + uzp1 v27.4S, v26.4S, v24.4S // .........*.................... + smull2 v17.2D, v31.4S, v7.4S // ..........*................... + mul v19.4S, v27.4S, v1.4S // ...........*.................. + uzp1 v21.4S, v29.4S, v5.4S // ...........*.................. + smull v4.2D, v31.2S, v7.2S // .............*................ + uzp1 v20.4S, v16.4S, v18.4S // .............*................ 
+ mul v23.4S, v21.4S, v1.4S // ..............*............... + smlsl v26.2D, v19.2S, v0.2S // ................*............. + smlsl2 v24.2D, v19.4S, v0.4S // .................*............ + uzp1 v19.4S, v4.4S, v17.4S // .................*............ + mul v31.4S, v20.4S, v1.4S // ..................*........... + smlsl v29.2D, v23.2S, v0.2S // ....................*......... + uzp2 v2.4S, v26.4S, v24.4S // .....................*........ + mul v24.4S, v19.4S, v1.4S // .....................*........ + smlsl v16.2D, v31.2S, v0.2S // .......................*...... + str q2, [x0, #48] // .......................*...... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q6, [x1, #48] // *.............................. + // ldr q28, [x2], #4*16 // ..*............................ + // ldr q22, [x1, #16] // ..*............................ + // ldr q25, [x2, #-32] // ...*........................... + // ldr q2, [x2, #-16] // *.............................. + // ldr q20, [x1], #4*16 // .*............................. + // ldr q23, [x2, #-48] // .*............................. + // ldr q24, [x1, #-32] // ...*........................... + // smull2 v26.2D, v6.4S, v2.4S // .....*......................... + // smull v2.2D, v6.2S, v2.2S // ....*.......................... + // smull2 v18.2D, v22.4S, v23.4S // .........*..................... + // smull v16.2D, v22.2S, v23.2S // ........*...................... + // smull v29.2D, v20.2S, v28.2S // ......*........................ + // smull2 v5.2D, v20.4S, v28.4S // .......*....................... + // uzp1 v27.4S, v2.4S, v26.4S // .........*..................... + // smull v4.2D, v24.2S, v25.2S // .............*................. + // mul v23.4S, v27.4S, v1.4S // ...........*................... + // uzp1 v30.4S, v16.4S, v18.4S // .............*................. + // ldr q6, [x1, #48] // ......*........................ + // ldr q28, [x2], #4*16 // .....*......................... + // mul v31.4S, v30.4S, v1.4S // ..................*............ + // uzp1 v7.4S, v29.4S, v5.4S // ...........*................... + // ldr q22, [x1, #16] // .......*....................... + // smull2 v17.2D, v24.4S, v25.4S // ..........*.................... + // ldr q25, [x2, #-32] // ....*.......................... + // smlsl v2.2D, v23.2S, v0.2S // ................*.............. + // smlsl2 v26.2D, v23.4S, v0.4S // .................*............. + // mul v23.4S, v7.4S, v1.4S // ..............*................ + // uzp1 v20.4S, v4.4S, v17.4S // .................*............. + // smlsl v16.2D, v31.2S, v0.2S // .......................*....... + // uzp2 v19.4S, v2.4S, v26.4S // .....................*......... + // mul v24.4S, v20.4S, v1.4S // .....................*......... + // str q19, [x0, #48] // .......................*....... + // smlsl v29.2D, v23.2S, v0.2S // ....................*.......... 
+ + sub count, count, #8 loop_start: - ldr q_a_1, [a_ptr, #1*16] - ldr q_a_2, [a_ptr, #2*16] - ldr q_a_3, [a_ptr, #3*16] - ldr q_a_0, [a_ptr], #4*16 - - ldr q_b_1, [b_ptr, #1*16] - ldr q_b_2, [b_ptr, #2*16] - ldr q_b_3, [b_ptr, #3*16] - ldr q_b_0, [b_ptr], #4*16 - // Bounds: |a_{i}|, |b_{i}| < 9q - - pmull c_0_lo, c_0_hi, a_0, b_0 - pmull c_1_lo, c_1_hi, a_1, b_1 - pmull c_2_lo, c_2_hi, a_2, b_2 - pmull c_3_lo, c_3_hi, a_3, b_3 - // Bounds: |c_{i}_lo|, |c_{i}_hi| < 81q^2 < qR/2 - - montgomery_reduce_long c_0, c_0_lo, c_0_hi - montgomery_reduce_long c_1, c_1_lo, c_1_hi - montgomery_reduce_long c_2, c_2_lo, c_2_hi - montgomery_reduce_long c_3, c_3_lo, c_3_hi - // All coefficients are Montgomery-reduced, resulting in - // - // Bounds: |c_{i}| < q - // - // See description of mld_montgomery_reduce() in mldsa/src/reduce.h. - - str q_c_1, [out_ptr, #1*16] - str q_c_2, [out_ptr, #2*16] - str q_c_3, [out_ptr, #3*16] - str q_c_0, [out_ptr], #4*16 - - subs count, count, #4 - cbnz count, loop_start + // Instructions: 40 + // Expected cycles: 24 + // Expected IPC: 1.67 + // + // Cycle bound: 24.0 + // IPC bound: 1.67 + // + // Wall time: 1.52s + // User time: 1.52s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + smlsl2 v5.2D, v23.4S, v0.4S // l............................. + ldr q2, [x2, #-16] // *............................. + smlsl2 v18.2D, v31.4S, v0.4S // .l............................ + ldr q20, [x1], #4*16 // .*............................ + smlsl v4.2D, v24.2S, v0.2S // ..l........................... + ldr q23, [x2, #-48] // ..*........................... + smlsl2 v17.2D, v24.4S, v0.4S // ...l.......................... + ldr q24, [x1, #-32] // ...*.......................... + uzp2 v27.4S, v29.4S, v5.4S // ....l......................... + smull2 v26.2D, v6.4S, v2.4S // ....*......................... + smull v2.2D, v6.2S, v2.2S // .....*........................ + uzp2 v19.4S, v16.4S, v18.4S // .....l........................ + smull2 v18.2D, v22.4S, v23.4S // ......*....................... + str q27, [x0], #4*16 // ......l....................... + smull v16.2D, v22.2S, v23.2S // .......*...................... + str q19, [x0, #-48] // .......l...................... + smull v29.2D, v20.2S, v28.2S // ........*..................... + uzp2 v3.4S, v4.4S, v17.4S // ........l..................... + smull2 v5.2D, v20.4S, v28.4S // .........*.................... + uzp1 v27.4S, v2.4S, v26.4S // .........*.................... + str q3, [x0, #-32] // ..........l................... + smull v4.2D, v24.2S, v25.2S // ..........*................... + mul v23.4S, v27.4S, v1.4S // ...........*.................. + uzp1 v30.4S, v16.4S, v18.4S // ...........*.................. + ldr q6, [x1, #48] // ............e................. + ldr q28, [x2], #4*16 // ............e................. + mul v31.4S, v30.4S, v1.4S // .............*................ + uzp1 v7.4S, v29.4S, v5.4S // .............*................ + ldr q22, [x1, #16] // ..............e............... + smull2 v17.2D, v24.4S, v25.4S // ...............*.............. + ldr q25, [x2, #-32] // ...............e.............. + smlsl v2.2D, v23.2S, v0.2S // ................*............. + smlsl2 v26.2D, v23.4S, v0.4S // .................*............ + mul v23.4S, v7.4S, v1.4S // ..................*........... + uzp1 v20.4S, v4.4S, v17.4S // ...................*.......... + smlsl v16.2D, v31.2S, v0.2S // ....................*......... + uzp2 v19.4S, v2.4S, v26.4S // .....................*........ 
+ mul v24.4S, v20.4S, v1.4S // .....................*........ + str q19, [x0, #48] // .......................*...... + smlsl v29.2D, v23.2S, v0.2S // .......................*...... + + // -------------- cycle (expected) --------------> + // 0 25 + // |------------------------|--------------------- + // ldr q17, [x1, #1*16] // ..e.........'.............~.........'.......... + // ldr q18, [x1, #2*16] // ............'..*....................'..~....... + // ldr q19, [x1, #3*16] // e...........'...........~...........'.......... + // ldr q16, [x1], #4*16 // ............'*......................'~......... + // ldr q21, [x2, #1*16] // ............'.*.....................'.~........ + // ldr q22, [x2, #2*16] // ...e........'..............~........'.......... + // ldr q23, [x2, #3*16] // ............*.......................~.......... + // ldr q20, [x2], #4*16 // e...........'...........~...........'.......... + // smull v24.2d, v16.2s, v20.2s // ............'.......*...............'.......~.. + // smull2 v25.2d, v16.4s, v20.4s // ............'........*..............'........~. + // smull v26.2d, v17.2s, v21.2s // ............'......*................'......~... + // smull2 v27.2d, v17.4s, v21.4s // ............'.....*.................'.....~.... + // smull v28.2d, v18.2s, v22.2s // ............'.........*.............'.......... + // smull2 v29.2d, v18.4s, v22.4s // ...~........'..............*........'.......... + // smull v30.2d, v19.2s, v23.2s // ............'....*..................'....~..... + // smull2 v31.2d, v19.4s, v23.4s // ............'...*...................'...~...... + // uzp1 v16.4s, v24.4s, v25.4s // .~..........'............*..........'.......... + // mul v16.4s, v16.4s, v1.4s // ......~.....'.................*.....'.......... + // smlsl v24.2d, v16.2s, v0.2s // ...........~'......................*'.......... + // smlsl2 v25.2d, v16.4s, v0.4s // ............~.......................l.......... + // uzp2 v16.4s, v24.4s, v25.4s // ............'...~...................'...l...... + // uzp1 v17.4s, v26.4s, v27.4s // ............'..........*............'.......... + // mul v17.4s, v17.4s, v1.4s // .~..........'............*..........'.......... + // smlsl v26.2d, v17.2s, v0.2s // ........~...'...................*...'.......... + // smlsl2 v27.2d, v17.4s, v0.4s // ............'~......................'l......... + // uzp2 v17.4s, v26.4s, v27.4s // ............'....~..................'....l..... + // uzp1 v18.4s, v28.4s, v29.4s // .......~....'..................*....'.......... + // mul v18.4s, v18.4s, v1.4s // .........~..'....................*..'.......... + // smlsl v28.2d, v18.2s, v0.2s // ............'.~.....................'.l........ + // smlsl2 v29.2d, v18.4s, v0.4s // ............'..~....................'..l....... + // uzp2 v18.4s, v28.4s, v29.4s // ............'.......~...............'.......l.. + // uzp1 v19.4s, v30.4s, v31.4s // ............'........*..............'........~. + // mul v19.4s, v19.4s, v1.4s // ............'..........*............'.......... + // smlsl v30.2d, v19.2s, v0.2s // ....~.......'...............*.......'.......... + // smlsl2 v31.2d, v19.4s, v0.4s // .....~......'................*......'.......... + // uzp2 v19.4s, v30.4s, v31.4s // .........~..'....................*..'.......... + // str q17, [x0, #1*16] // ............'......~................'......l... + // str q18, [x0, #2*16] // ............'.........~.............'.........l + // str q19, [x0, #3*16] // ...........~'......................*'.......... 
+ // str q16, [x0], #4*16 // ............'.....~.................'.....l.... + + subs count, count, 4 + cbnz count, loop_start + // Instructions: 46 + // Expected cycles: 34 + // Expected IPC: 1.35 + // + // Cycle bound: 34.0 + // IPC bound: 1.35 + // + // Wall time: 0.22s + // User time: 0.22s + // + // ------- cycle (expected) --------> + // 0 25 + // |------------------------|-------- + smlsl2 v18.2D, v31.4S, v0.4S // *................................. + ldr q19, [x2, #-16] // *................................. + smlsl v4.2D, v24.2S, v0.2S // .*................................ + ldr q30, [x1, #32] // .*................................ + smlsl2 v17.2D, v24.4S, v0.4S // ..*............................... + ldr q20, [x2, #-48] // ..*............................... + smlsl2 v5.2D, v23.4S, v0.4S // ...*.............................. + ldr q2, [x1], #4*16 // ...*.............................. + uzp2 v16.4S, v16.4S, v18.4S // ....*............................. + smull2 v26.2D, v6.4S, v19.4S // ....*............................. + smull v31.2D, v6.2S, v19.2S // .....*............................ + uzp2 v18.4S, v4.4S, v17.4S // ......*........................... + smull v24.2D, v30.2S, v25.2S // ......*........................... + smull2 v6.2D, v30.4S, v25.4S // .......*.......................... + uzp2 v3.4S, v29.4S, v5.4S // .......*.......................... + str q18, [x0, #32] // ........*......................... + smull2 v19.2D, v22.4S, v20.4S // ........*......................... + smull v23.2D, v22.2S, v20.2S // .........*........................ + uzp1 v27.4S, v31.4S, v26.4S // .........*........................ + str q16, [x0, #16] // ..........*....................... + smull v7.2D, v2.2S, v28.2S // ..........*....................... + smull2 v5.2D, v2.4S, v28.4S // ...........*...................... + uzp1 v22.4S, v24.4S, v6.4S // ...........*...................... + str q3, [x0], #4*16 // ............*..................... + mul v17.4S, v27.4S, v1.4S // ............*..................... + uzp1 v30.4S, v23.4S, v19.4S // .............*.................... + mul v21.4S, v22.4S, v1.4S // ..............*................... + uzp1 v18.4S, v7.4S, v5.4S // ...............*.................. + mul v22.4S, v30.4S, v1.4S // ................*................. + smlsl2 v26.2D, v17.4S, v0.4S // ..................*............... + mul v3.4S, v18.4S, v1.4S // ...................*.............. + smlsl v31.2D, v17.2S, v0.2S // .....................*............ + smlsl v24.2D, v21.2S, v0.2S // ......................*........... + smlsl2 v19.2D, v22.4S, v0.4S // .......................*.......... + smlsl2 v5.2D, v3.4S, v0.4S // ........................*......... + smlsl v7.2D, v3.2S, v0.2S // .........................*........ + uzp2 v25.4S, v31.4S, v26.4S // .........................*........ + smlsl v23.2D, v22.2S, v0.2S // ..........................*....... + smlsl2 v6.2D, v21.4S, v0.4S // ...........................*...... + str q25, [x0, #48] // ...........................*...... + uzp2 v26.4S, v7.4S, v5.4S // .............................*.... + uzp2 v19.4S, v23.4S, v19.4S // ..............................*... + uzp2 v21.4S, v24.4S, v6.4S // ...............................*.. + str q26, [x0], #4*16 // ...............................*.. + str q19, [x0, #-48] // ................................*. 
+ str q21, [x0, #-32] // .................................* + + // ------- cycle (expected) --------> + // 0 25 + // |------------------------|-------- + // smlsl2 v5.2D, v23.4S, v0.4S // ...*.............................. + // ldr q2, [x2, #-16] // *................................. + // smlsl2 v18.2D, v31.4S, v0.4S // *................................. + // ldr q20, [x1], #4*16 // ...*.............................. + // smlsl v4.2D, v24.2S, v0.2S // .*................................ + // ldr q23, [x2, #-48] // ..*............................... + // smlsl2 v17.2D, v24.4S, v0.4S // ..*............................... + // ldr q24, [x1, #-32] // .*................................ + // uzp2 v27.4S, v29.4S, v5.4S // .......*.......................... + // smull2 v26.2D, v6.4S, v2.4S // ....*............................. + // smull v2.2D, v6.2S, v2.2S // .....*............................ + // uzp2 v19.4S, v16.4S, v18.4S // ....*............................. + // smull2 v18.2D, v22.4S, v23.4S // ........*......................... + // str q27, [x0], #4*16 // ............*..................... + // smull v16.2D, v22.2S, v23.2S // .........*........................ + // str q19, [x0, #-48] // ..........*....................... + // smull v29.2D, v20.2S, v28.2S // ..........*....................... + // uzp2 v3.4S, v4.4S, v17.4S // ......*........................... + // smull2 v5.2D, v20.4S, v28.4S // ...........*...................... + // uzp1 v27.4S, v2.4S, v26.4S // .........*........................ + // str q3, [x0, #-32] // ........*......................... + // smull v4.2D, v24.2S, v25.2S // ......*........................... + // mul v23.4S, v27.4S, v1.4S // ............*..................... + // uzp1 v30.4S, v16.4S, v18.4S // .............*.................... + // mul v31.4S, v30.4S, v1.4S // ................*................. + // uzp1 v7.4S, v29.4S, v5.4S // ...............*.................. + // smull2 v17.2D, v24.4S, v25.4S // .......*.......................... + // smlsl v2.2D, v23.2S, v0.2S // .....................*............ + // smlsl2 v26.2D, v23.4S, v0.4S // ..................*............... + // mul v23.4S, v7.4S, v1.4S // ...................*.............. + // uzp1 v20.4S, v4.4S, v17.4S // ...........*...................... + // smlsl v16.2D, v31.2S, v0.2S // ..........................*....... + // uzp2 v19.4S, v2.4S, v26.4S // .........................*........ + // mul v24.4S, v20.4S, v1.4S // ..............*................... + // str q19, [x0, #48] // ...........................*...... + // smlsl v29.2D, v23.2S, v0.2S // .........................*........ + // smlsl2 v5.2D, v23.4S, v0.4S // ........................*......... + // smlsl2 v18.2D, v31.4S, v0.4S // .......................*.......... + // smlsl v4.2D, v24.2S, v0.2S // ......................*........... + // smlsl2 v17.2D, v24.4S, v0.4S // ...........................*...... + // uzp2 v27.4S, v29.4S, v5.4S // .............................*.... + // uzp2 v19.4S, v16.4S, v18.4S // ..............................*... + // str q27, [x0], #4*16 // ...............................*.. + // str q19, [x0, #-48] // ................................*. + // uzp2 v3.4S, v4.4S, v17.4S // ...............................*.. 
+ // str q3, [x0, #-32] // .................................* + ret diff --git a/mldsa/src/native/aarch64/src/pointwise_montgomery.S b/mldsa/src/native/aarch64/src/pointwise_montgomery.S index bb85dec20..cf5577920 100644 --- a/mldsa/src/native/aarch64/src/pointwise_montgomery.S +++ b/mldsa/src/native/aarch64/src/pointwise_montgomery.S @@ -1,6 +1,6 @@ /* Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ #include "../../../common.h" #if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) @@ -27,50 +27,131 @@ MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_asm) movk w3, #0x380, lsl #16 dup v1.4s, w3 mov x3, #0x40 // =64 + ldr q3, [x2, #0x30] + ldr q28, [x1, #0x30] + ldr q5, [x1], #0x40 + ldr q23, [x2, #0x10] + ldr q2, [x2], #0x40 + ldur q19, [x1, #-0x30] + ldur q31, [x1, #-0x20] + ldur q7, [x2, #-0x20] + smull v26.2d, v28.2s, v3.2s + ldr q25, [x2, #0x20] + smull2 v24.2d, v28.4s, v3.4s + ldr q28, [x2], #0x40 + smull v29.2d, v5.2s, v2.2s + ldr q6, [x1, #0x30] + smull2 v5.2d, v5.4s, v2.4s + ldr q22, [x1, #0x10] + smull v16.2d, v19.2s, v23.2s + smull2 v18.2d, v19.4s, v23.4s + uzp1 v27.4s, v26.4s, v24.4s + smull2 v17.2d, v31.4s, v7.4s + mul v19.4s, v27.4s, v1.4s + uzp1 v21.4s, v29.4s, v5.4s + smull v4.2d, v31.2s, v7.2s + uzp1 v20.4s, v16.4s, v18.4s + mul v23.4s, v21.4s, v1.4s + smlsl v26.2d, v19.2s, v0.2s + smlsl2 v24.2d, v19.4s, v0.4s + uzp1 v19.4s, v4.4s, v17.4s + mul v31.4s, v20.4s, v1.4s + smlsl v29.2d, v23.2s, v0.2s + uzp2 v2.4s, v26.4s, v24.4s + mul v24.4s, v19.4s, v1.4s + smlsl v16.2d, v31.2s, v0.2s + str q2, [x0, #0x30] + sub x3, x3, #0x8 Lloop_start: - ldr q17, [x1, #0x10] - ldr q18, [x1, #0x20] - ldr q19, [x1, #0x30] - ldr q16, [x1], #0x40 - ldr q21, [x2, #0x10] - ldr q22, [x2, #0x20] - ldr q23, [x2, #0x30] - ldr q20, [x2], #0x40 - smull v24.2d, v16.2s, v20.2s - smull2 v25.2d, v16.4s, v20.4s - smull v26.2d, v17.2s, v21.2s - smull2 v27.2d, v17.4s, v21.4s - smull v28.2d, v18.2s, v22.2s - smull2 v29.2d, v18.4s, v22.4s - smull v30.2d, v19.2s, v23.2s - smull2 v31.2d, v19.4s, v23.4s - uzp1 v16.4s, v24.4s, v25.4s - mul v16.4s, v16.4s, v1.4s - smlsl v24.2d, v16.2s, v0.2s - smlsl2 v25.2d, v16.4s, v0.4s - uzp2 v16.4s, v24.4s, v25.4s - uzp1 v17.4s, v26.4s, v27.4s - mul v17.4s, v17.4s, v1.4s - smlsl v26.2d, v17.2s, v0.2s - smlsl2 v27.2d, v17.4s, v0.4s - uzp2 v17.4s, v26.4s, v27.4s - uzp1 v18.4s, v28.4s, v29.4s - mul v18.4s, v18.4s, v1.4s - smlsl v28.2d, v18.2s, v0.2s - smlsl2 v29.2d, v18.4s, v0.4s - uzp2 v18.4s, v28.4s, v29.4s - uzp1 v19.4s, v30.4s, v31.4s - mul v19.4s, v19.4s, v1.4s - smlsl v30.2d, v19.2s, v0.2s - smlsl2 v31.2d, v19.4s, v0.4s - uzp2 v19.4s, v30.4s, v31.4s - str q17, [x0, #0x10] - str q18, [x0, #0x20] + smlsl2 v5.2d, v23.4s, v0.4s + ldur q2, [x2, #-0x10] + smlsl2 v18.2d, v31.4s, v0.4s + ldr q20, [x1], #0x40 + smlsl v4.2d, v24.2s, v0.2s + ldur q23, [x2, #-0x30] + smlsl2 v17.2d, v24.4s, v0.4s + ldur q24, [x1, #-0x20] + uzp2 v27.4s, v29.4s, v5.4s + smull2 v26.2d, v6.4s, v2.4s + smull v2.2d, v6.2s, v2.2s + uzp2 v19.4s, v16.4s, v18.4s + smull2 v18.2d, v22.4s, v23.4s + str q27, [x0], #0x40 + smull v16.2d, v22.2s, v23.2s + stur q19, [x0, #-0x30] + smull v29.2d, v20.2s, v28.2s + uzp2 v3.4s, v4.4s, v17.4s + smull2 v5.2d, v20.4s, v28.4s + uzp1 v27.4s, v2.4s, v26.4s + stur q3, [x0, #-0x20] + smull v4.2d, v24.2s, v25.2s + mul v23.4s, v27.4s, v1.4s + uzp1 v30.4s, v16.4s, v18.4s + ldr q6, [x1, #0x30] + ldr q28, [x2], #0x40 + mul v31.4s, v30.4s, v1.4s + uzp1 
v7.4s, v29.4s, v5.4s + ldr q22, [x1, #0x10] + smull2 v17.2d, v24.4s, v25.4s + ldur q25, [x2, #-0x20] + smlsl v2.2d, v23.2s, v0.2s + smlsl2 v26.2d, v23.4s, v0.4s + mul v23.4s, v7.4s, v1.4s + uzp1 v20.4s, v4.4s, v17.4s + smlsl v16.2d, v31.2s, v0.2s + uzp2 v19.4s, v2.4s, v26.4s + mul v24.4s, v20.4s, v1.4s str q19, [x0, #0x30] - str q16, [x0], #0x40 + smlsl v29.2d, v23.2s, v0.2s subs x3, x3, #0x4 cbnz x3, Lloop_start + smlsl2 v18.2d, v31.4s, v0.4s + ldur q19, [x2, #-0x10] + smlsl v4.2d, v24.2s, v0.2s + ldr q30, [x1, #0x20] + smlsl2 v17.2d, v24.4s, v0.4s + ldur q20, [x2, #-0x30] + smlsl2 v5.2d, v23.4s, v0.4s + ldr q2, [x1], #0x40 + uzp2 v16.4s, v16.4s, v18.4s + smull2 v26.2d, v6.4s, v19.4s + smull v31.2d, v6.2s, v19.2s + uzp2 v18.4s, v4.4s, v17.4s + smull v24.2d, v30.2s, v25.2s + smull2 v6.2d, v30.4s, v25.4s + uzp2 v3.4s, v29.4s, v5.4s + str q18, [x0, #0x20] + smull2 v19.2d, v22.4s, v20.4s + smull v23.2d, v22.2s, v20.2s + uzp1 v27.4s, v31.4s, v26.4s + str q16, [x0, #0x10] + smull v7.2d, v2.2s, v28.2s + smull2 v5.2d, v2.4s, v28.4s + uzp1 v22.4s, v24.4s, v6.4s + str q3, [x0], #0x40 + mul v17.4s, v27.4s, v1.4s + uzp1 v30.4s, v23.4s, v19.4s + mul v21.4s, v22.4s, v1.4s + uzp1 v18.4s, v7.4s, v5.4s + mul v22.4s, v30.4s, v1.4s + smlsl2 v26.2d, v17.4s, v0.4s + mul v3.4s, v18.4s, v1.4s + smlsl v31.2d, v17.2s, v0.2s + smlsl v24.2d, v21.2s, v0.2s + smlsl2 v19.2d, v22.4s, v0.4s + smlsl2 v5.2d, v3.4s, v0.4s + smlsl v7.2d, v3.2s, v0.2s + uzp2 v25.4s, v31.4s, v26.4s + smlsl v23.2d, v22.2s, v0.2s + smlsl2 v6.2d, v21.4s, v0.4s + str q25, [x0, #0x30] + uzp2 v26.4s, v7.4s, v5.4s + uzp2 v19.4s, v23.4s, v19.4s + uzp2 v21.4s, v24.4s, v6.4s + str q26, [x0], #0x40 + stur q19, [x0, #-0x30] + stur q21, [x0, #-0x20] ret .cfi_endproc
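
Note: for reference, the superoptimized loop above computes the same coefficient-wise product followed by a Montgomery reduction as the reference-style C below. This is a minimal sketch assuming the standard ML-DSA modulus q = 8380417 and R = 2^32 (QINV = q^-1 mod 2^32, i.e. 0x03802001, the constant loaded into v1 in the prologue); the identifiers are illustrative and not necessarily the project's exact names. As the comments in the pre-optimization assembly note, inputs are bounded by |a_i|, |b_i| < 9q, so the 64-bit products stay below q*R/2 and a single Montgomery reduction yields |c_i| < q.

#include <stdint.h>

#define MLDSA_N 256
#define MLDSA_Q 8380417   /* q = 2^23 - 2^13 + 1 */
#define QINV    58728449  /* q^-1 mod 2^32 */

/* Montgomery reduction: for |a| < q*2^31, returns r with r*2^32 == a (mod q)
 * and |r| < q. Mirrors the smull/uzp1/mul/smlsl/uzp2 sequence in the
 * assembly above. */
static int32_t montgomery_reduce(int64_t a)
{
  /* low 32 bits of a * q^-1 (uzp1 + mul in the vector code) */
  int32_t t = (int32_t)((int64_t)(int32_t)a * QINV);
  /* subtract t*q and keep the high 32 bits (smlsl + uzp2) */
  return (int32_t)((a - (int64_t)t * MLDSA_Q) >> 32);
}

/* c[i] = a[i]*b[i]*2^-32 mod q, coefficient-wise (poly_pointwise_montgomery). */
static void pointwise_montgomery_ref(int32_t c[MLDSA_N],
                                     const int32_t a[MLDSA_N],
                                     const int32_t b[MLDSA_N])
{
  unsigned i;
  for (i = 0; i < MLDSA_N; i++)
  {
    c[i] = montgomery_reduce((int64_t)a[i] * (int64_t)b[i]);
  }
}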