diff --git a/docs/algorithms/kem/ml_kem.md b/docs/algorithms/kem/ml_kem.md index 7343bf24f..fe69e7ef0 100644 --- a/docs/algorithms/kem/ml_kem.md +++ b/docs/algorithms/kem/ml_kem.md @@ -7,9 +7,9 @@ - **Authors' website**: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203 - **Specification version**: ML-KEM. - **Primary Source**: - - **Source**: https://github.com/pq-code-package/mlkem-native/commit/84398e7230fa31ba4241f5eb36bdc3c1dbbd5bcd with copy_from_upstream patches + - **Source**: https://github.com/pq-code-package/mlkem-native/commit/d830bc22eb1613bbe38028cfefc33f1a52a40b2f with copy_from_upstream patches - **Implementation license (SPDX-Identifier)**: CC0-1.0 or Apache-2.0 -- **Optimized Implementation sources**: https://github.com/pq-code-package/mlkem-native/commit/84398e7230fa31ba4241f5eb36bdc3c1dbbd5bcd with copy_from_upstream patches +- **Optimized Implementation sources**: https://github.com/pq-code-package/mlkem-native/commit/d830bc22eb1613bbe38028cfefc33f1a52a40b2f with copy_from_upstream patches - **cupqc-cuda**: - **Source**: https://github.com/praveksharma/cupqc-mlkem/commit/b026f4e5475cd9c20c2082c7d9bad80e5b0ba89e - **Implementation license (SPDX-Identifier)**: Apache-2.0 diff --git a/docs/algorithms/kem/ml_kem.yml b/docs/algorithms/kem/ml_kem.yml index 62400591b..4042f6f60 100644 --- a/docs/algorithms/kem/ml_kem.yml +++ b/docs/algorithms/kem/ml_kem.yml @@ -17,7 +17,7 @@ website: https://pq-crystals.org/kyber/ and https://csrc.nist.gov/pubs/fips/203 nist-round: FIPS203 spec-version: ML-KEM primary-upstream: - source: https://github.com/pq-code-package/mlkem-native/commit/84398e7230fa31ba4241f5eb36bdc3c1dbbd5bcd + source: https://github.com/pq-code-package/mlkem-native/commit/d830bc22eb1613bbe38028cfefc33f1a52a40b2f with copy_from_upstream patches spdx-license-identifier: CC0-1.0 or Apache-2.0 optimized-upstreams: diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml 
index bc08de94f..54fc50307 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -34,7 +34,7 @@ upstreams: name: mlkem-native git_url: https://github.com/pq-code-package/mlkem-native.git git_branch: main - git_commit: 84398e7230fa31ba4241f5eb36bdc3c1dbbd5bcd + git_commit: d830bc22eb1613bbe38028cfefc33f1a52a40b2f kem_meta_path: 'integration/liboqs/{pretty_name_full}_META.yml' kem_scheme_path: '.' patches: [mlkem-native-ml_kem.patch] diff --git a/src/kem/ml_kem/CMakeLists.txt b/src/kem/ml_kem/CMakeLists.txt index aabc775fa..bd2201513 100644 --- a/src/kem/ml_kem/CMakeLists.txt +++ b/src/kem/ml_kem/CMakeLists.txt @@ -15,7 +15,7 @@ if(OQS_ENABLE_KEM_ml_kem_512) endif() if(OQS_ENABLE_KEM_ml_kem_512_x86_64) - add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/compress.c mlkem-native_ml-kem-512_x86_64/debug.c mlkem-native_ml-kem-512_x86_64/indcpa.c mlkem-native_ml-kem-512_x86_64/kem.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-512_x86_64/poly.c mlkem-native_ml-kem-512_x86_64/poly_k.c mlkem-native_ml-kem-512_x86_64/sampling.c mlkem-native_ml-kem-512_x86_64/verify.c mlkem-native_ml-kem-512_x86_64/zetas.c) + add_library(ml_kem_512_x86_64 OBJECT mlkem-native_ml-kem-512_x86_64/compress.c mlkem-native_ml-kem-512_x86_64/debug.c mlkem-native_ml-kem-512_x86_64/indcpa.c mlkem-native_ml-kem-512_x86_64/kem.c 
mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttpack.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/reduce.S mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-512_x86_64/native/x86_64/src/tomont.S mlkem-native_ml-kem-512_x86_64/poly.c mlkem-native_ml-kem-512_x86_64/poly_k.c mlkem-native_ml-kem-512_x86_64/sampling.c mlkem-native_ml-kem-512_x86_64/verify.c mlkem-native_ml-kem-512_x86_64/zetas.c) target_include_directories(ml_kem_512_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_x86_64) target_include_directories(ml_kem_512_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_512_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt ) @@ -24,7 +24,7 @@ if(OQS_ENABLE_KEM_ml_kem_512_x86_64) endif() if(OQS_ENABLE_KEM_ml_kem_512_aarch64) - add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/compress.c mlkem-native_ml-kem-512_aarch64/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S 
mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/poly_k.c mlkem-native_ml-kem-512_aarch64/sampling.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c) + add_library(ml_kem_512_aarch64 OBJECT mlkem-native_ml-kem-512_aarch64/compress.c mlkem-native_ml-kem-512_aarch64/debug.c mlkem-native_ml-kem-512_aarch64/indcpa.c mlkem-native_ml-kem-512_aarch64/kem.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_reduce_asm_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tomont_asm_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-512_aarch64/poly.c mlkem-native_ml-kem-512_aarch64/poly_k.c mlkem-native_ml-kem-512_aarch64/sampling.c mlkem-native_ml-kem-512_aarch64/verify.c mlkem-native_ml-kem-512_aarch64/zetas.c) 
target_include_directories(ml_kem_512_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-512_aarch64) target_include_directories(ml_kem_512_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_512_aarch64 PUBLIC -DMLKEM_K=2 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM512_AARCH64_OPT) @@ -49,7 +49,7 @@ if(OQS_ENABLE_KEM_ml_kem_768) endif() if(OQS_ENABLE_KEM_ml_kem_768_x86_64) - add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/compress.c mlkem-native_ml-kem-768_x86_64/debug.c mlkem-native_ml-kem-768_x86_64/indcpa.c mlkem-native_ml-kem-768_x86_64/kem.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-768_x86_64/poly.c mlkem-native_ml-kem-768_x86_64/poly_k.c mlkem-native_ml-kem-768_x86_64/sampling.c mlkem-native_ml-kem-768_x86_64/verify.c mlkem-native_ml-kem-768_x86_64/zetas.c) + add_library(ml_kem_768_x86_64 OBJECT mlkem-native_ml-kem-768_x86_64/compress.c mlkem-native_ml-kem-768_x86_64/debug.c mlkem-native_ml-kem-768_x86_64/indcpa.c mlkem-native_ml-kem-768_x86_64/kem.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/consts.c 
mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttpack.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/reduce.S mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-768_x86_64/native/x86_64/src/tomont.S mlkem-native_ml-kem-768_x86_64/poly.c mlkem-native_ml-kem-768_x86_64/poly_k.c mlkem-native_ml-kem-768_x86_64/sampling.c mlkem-native_ml-kem-768_x86_64/verify.c mlkem-native_ml-kem-768_x86_64/zetas.c) target_include_directories(ml_kem_768_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_x86_64) target_include_directories(ml_kem_768_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_768_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt ) @@ -58,7 +58,7 @@ if(OQS_ENABLE_KEM_ml_kem_768_x86_64) endif() if(OQS_ENABLE_KEM_ml_kem_768_aarch64) - add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/compress.c mlkem-native_ml-kem-768_aarch64/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S 
mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/poly_k.c mlkem-native_ml-kem-768_aarch64/sampling.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c) + add_library(ml_kem_768_aarch64 OBJECT mlkem-native_ml-kem-768_aarch64/compress.c mlkem-native_ml-kem-768_aarch64/debug.c mlkem-native_ml-kem-768_aarch64/indcpa.c mlkem-native_ml-kem-768_aarch64/kem.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_reduce_asm_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tomont_asm_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-768_aarch64/poly.c mlkem-native_ml-kem-768_aarch64/poly_k.c mlkem-native_ml-kem-768_aarch64/sampling.c mlkem-native_ml-kem-768_aarch64/verify.c mlkem-native_ml-kem-768_aarch64/zetas.c) target_include_directories(ml_kem_768_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-768_aarch64) target_include_directories(ml_kem_768_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_768_aarch64 PUBLIC -DMLKEM_K=3 -DFORCE_AARCH64 
-DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM768_AARCH64_OPT) @@ -83,7 +83,7 @@ if(OQS_ENABLE_KEM_ml_kem_1024) endif() if(OQS_ENABLE_KEM_ml_kem_1024_x86_64) - add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/compress.c mlkem-native_ml-kem-1024_x86_64/debug.c mlkem-native_ml-kem-1024_x86_64/indcpa.c mlkem-native_ml-kem-1024_x86_64/kem.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S mlkem-native_ml-kem-1024_x86_64/poly.c mlkem-native_ml-kem-1024_x86_64/poly_k.c mlkem-native_ml-kem-1024_x86_64/sampling.c mlkem-native_ml-kem-1024_x86_64/verify.c mlkem-native_ml-kem-1024_x86_64/zetas.c) + add_library(ml_kem_1024_x86_64 OBJECT mlkem-native_ml-kem-1024_x86_64/compress.c mlkem-native_ml-kem-1024_x86_64/debug.c mlkem-native_ml-kem-1024_x86_64/indcpa.c mlkem-native_ml-kem-1024_x86_64/kem.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/compress_avx2.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/consts.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttfrombytes.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttpack.S 
mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntttobytes.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttunpack.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/reduce.S mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_avx2.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/tomont.S mlkem-native_ml-kem-1024_x86_64/poly.c mlkem-native_ml-kem-1024_x86_64/poly_k.c mlkem-native_ml-kem-1024_x86_64/sampling.c mlkem-native_ml-kem-1024_x86_64/verify.c mlkem-native_ml-kem-1024_x86_64/zetas.c) target_include_directories(ml_kem_1024_x86_64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_x86_64) target_include_directories(ml_kem_1024_x86_64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_1024_x86_64 PRIVATE -mavx2 -mbmi2 -mpopcnt ) @@ -92,7 +92,7 @@ if(OQS_ENABLE_KEM_ml_kem_1024_x86_64) endif() if(OQS_ENABLE_KEM_ml_kem_1024_aarch64) - add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/compress.c mlkem-native_ml-kem-1024_aarch64/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_aarch64/poly.c 
mlkem-native_ml-kem-1024_aarch64/poly_k.c mlkem-native_ml-kem-1024_aarch64/sampling.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c) + add_library(ml_kem_1024_aarch64 OBJECT mlkem-native_ml-kem-1024_aarch64/compress.c mlkem-native_ml-kem-1024_aarch64/debug.c mlkem-native_ml-kem-1024_aarch64/indcpa.c mlkem-native_ml-kem-1024_aarch64/kem.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_reduce_asm_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tomont_asm_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c mlkem-native_ml-kem-1024_aarch64/poly.c mlkem-native_ml-kem-1024_aarch64/poly_k.c mlkem-native_ml-kem-1024_aarch64/sampling.c mlkem-native_ml-kem-1024_aarch64/verify.c mlkem-native_ml-kem-1024_aarch64/zetas.c) target_include_directories(ml_kem_1024_aarch64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlkem-native_ml-kem-1024_aarch64) target_include_directories(ml_kem_1024_aarch64 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) target_compile_options(ml_kem_1024_aarch64 PUBLIC -DMLKEM_K=4 -DFORCE_AARCH64 -DMLKEM_NATIVE_ARITH_BACKEND_NAME=AARCH64_OPT -DMLKEM_USE_NATIVE_BACKEND_ARITH -DMLKEM_NAMESPACE_PREFIX=PQCP_MLKEM_NATIVE_MLKEM1024_AARCH64_OPT) diff --git 
a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md index e499a4a22..a420f05b6 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/README.md @@ -10,10 +10,9 @@ works: - _Fast and Clean: Auditable high-performance assembly via constraint solving_, Amin Abdulrahman, Hanno Becker, Matthias J. Kannwischer, Fabien Klein, [https://eprint.iacr.org/2022/1303](https://eprint.iacr.org/2022/1303) -## Profiles -This backend comes with two profiles: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to -read and modify; for example, is heavily leverages register aliases and assembly macros. The optimized profile is -automatically generated from the clean profile via [SLOTHY](https://github.com/slothy-optimizer/slothy). Currently, the +## Variants + +This backend comes in two versions: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to read and modify; for example, it heavily leverages register aliases and assembly macros. This directory contains the optimized version, which is automatically generated from the clean one via [SLOTHY](https://github.com/slothy-optimizer/slothy). Currently, the target architecture is Cortex-A55, but you can easily re-optimize the code for a different microarchitecture supported -by SLOTHY, by adjusting the parameters in [optimize.sh](src/optimize.sh). +by SLOTHY, by adjusting the parameters in [optimize.sh](../../../test/aarch64_clean/src/optimize.sh). 
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h deleted file mode 100644 index f124702a4..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/clean.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -/* ML-KEM arithmetic native profile for clean assembly */ - -#ifdef MLKEM_NATIVE_ARITH_PROFILE_H -#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles? -#else -#define MLKEM_NATIVE_ARITH_PROFILE_H - -/* Identifier for this backend so that source and assembly files - * in the build can be appropriately guarded. */ -#define MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN - -#define MLKEM_NATIVE_ARITH_BACKEND_NAME AARCH64_CLEAN - -/* Filename of the C backend implementation. - * This is not inlined here because this header is included in assembly - * files as well. */ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h" - -#endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h index a7217163f..4a0243279 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/opt.h @@ -3,8 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -/* ML-KEM arithmetic native profile for clean assembly */ - #ifdef MLKEM_NATIVE_ARITH_PROFILE_H #error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles? 
#else diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c index 2c1bb31e1..23e7949d3 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/aarch64_zetas.c @@ -10,8 +10,7 @@ #include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) #include #include "arith_native_aarch64.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h index ed0825892..60779598d 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/arith_native_aarch64.h @@ -29,62 +29,49 @@ extern const int16_t aarch64_zetas_mulcache_native[]; extern const int16_t aarch64_zetas_mulcache_twisted_native[]; extern const uint8_t rej_uniform_table[]; -#define ntt_asm_clean MLKEM_NAMESPACE(ntt_asm_clean) -void ntt_asm_clean(int16_t *, const int16_t *, const int16_t *); - #define ntt_asm_opt MLKEM_NAMESPACE(ntt_asm_opt) void ntt_asm_opt(int16_t *, const int16_t *, const int16_t *); -#define intt_asm_clean MLKEM_NAMESPACE(intt_asm_clean) -void intt_asm_clean(int16_t *, const int16_t *, const int16_t *); - #define intt_asm_opt MLKEM_NAMESPACE(intt_asm_opt) void intt_asm_opt(int16_t *, const int16_t *, const int16_t *); -#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean) -unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen, - const uint8_t *table); - -#define poly_reduce_asm_clean MLKEM_NAMESPACE(poly_reduce_asm_clean) -void 
poly_reduce_asm_clean(int16_t *); - #define poly_reduce_asm_opt MLKEM_NAMESPACE(poly_reduce_asm_opt) void poly_reduce_asm_opt(int16_t *); -#define poly_tomont_asm_clean MLKEM_NAMESPACE(poly_tomont_asm_clean) -void poly_tomont_asm_clean(int16_t *); - #define poly_tomont_asm_opt MLKEM_NAMESPACE(poly_tomont_asm_opt) void poly_tomont_asm_opt(int16_t *); -#define poly_mulcache_compute_asm_clean \ - MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean) -void poly_mulcache_compute_asm_clean(int16_t *, const int16_t *, - const int16_t *, const int16_t *); - - #define poly_mulcache_compute_asm_opt \ MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt) void poly_mulcache_compute_asm_opt(int16_t *, const int16_t *, const int16_t *, const int16_t *); -#define poly_tobytes_asm_clean MLKEM_NAMESPACE(poly_tobytes_asm_clean) -void poly_tobytes_asm_clean(uint8_t *r, const int16_t *a); - #define poly_tobytes_asm_opt MLKEM_NAMESPACE(poly_tobytes_asm_opt) void poly_tobytes_asm_opt(uint8_t *r, const int16_t *a); -#define polyvec_basemul_acc_montgomery_cached_asm_clean \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) -void polyvec_basemul_acc_montgomery_cached_asm_clean(int16_t *r, - const int16_t *a, - const int16_t *b, - const int16_t *b_cache); +#define polyvec_basemul_acc_montgomery_cached_asm_k2_opt \ + MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt) +void polyvec_basemul_acc_montgomery_cached_asm_k2_opt(int16_t *r, + const int16_t *a, + const int16_t *b, + const int16_t *b_cache); + +#define polyvec_basemul_acc_montgomery_cached_asm_k3_opt \ + MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt) +void polyvec_basemul_acc_montgomery_cached_asm_k3_opt(int16_t *r, + const int16_t *a, + const int16_t *b, + const int16_t *b_cache); + +#define polyvec_basemul_acc_montgomery_cached_asm_k4_opt \ + MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt) +void polyvec_basemul_acc_montgomery_cached_asm_k4_opt(int16_t *r, + const int16_t *a, 
+ const int16_t *b, + const int16_t *b_cache); -#define polyvec_basemul_acc_montgomery_cached_asm_opt \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) -void polyvec_basemul_acc_montgomery_cached_asm_opt(int16_t *r, const int16_t *a, - const int16_t *b, - const int16_t *b_cache); +#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean) +unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen, + const uint8_t *table); #endif /* MLKEM_AARCH64_NATIVE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h deleted file mode 100644 index 4be90fb24..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/clean_impl.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -/* ML-KEM arithmetic native profile for clean assembly */ - -#ifdef MLKEM_NATIVE_ARITH_PROFILE_IMPL_H -#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles? 
-#else -#define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H - -#include "arith_native_aarch64.h" - -/* Set of primitives that this backend replaces */ -#define MLKEM_USE_NATIVE_NTT -#define MLKEM_USE_NATIVE_INTT -#define MLKEM_USE_NATIVE_POLY_REDUCE -#define MLKEM_USE_NATIVE_POLY_TOMONT -#define MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE -#define MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED -#define MLKEM_USE_NATIVE_POLY_TOBYTES -#define MLKEM_USE_NATIVE_REJ_UNIFORM - -static INLINE void ntt_native(int16_t data[MLKEM_N]) -{ - ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56); -} - -static INLINE void intt_native(int16_t data[MLKEM_N]) -{ - intt_asm_clean(data, aarch64_invntt_zetas_layer01234, - aarch64_invntt_zetas_layer56); -} - -static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) -{ - poly_reduce_asm_clean(data); -} - -static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) -{ - poly_tomont_asm_clean(data); -} - -static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], - const int16_t y[MLKEM_N]) -{ - poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native, - aarch64_zetas_mulcache_twisted_native); -} - -static INLINE void polyvec_basemul_acc_montgomery_cached_native( - int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], - const int16_t b[MLKEM_K * MLKEM_N], - const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) -{ - polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache); -} - -static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const int16_t a[MLKEM_N]) -{ - poly_tobytes_asm_clean(r, a); -} - -static INLINE int rej_uniform_native(int16_t *r, unsigned len, - const uint8_t *buf, unsigned buflen) -{ - if (len != MLKEM_N || buflen % 24 != 0) - { - return -1; - } - return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table); -} - -#endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S 
b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S deleted file mode 100644 index b0ae1ad46..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_clean.S +++ /dev/null @@ -1,389 +0,0 @@ -/// Copyright (c) 2024 The mlkem-native project authors -/// Copyright (c) 2022 Arm Limited -/// Copyright (c) 2022 Hanno Becker -/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE. -/// - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - -// Bounds: -// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) -// -// See mlken/reduce.c and test/test_bounds.py for more details. -.macro mulmodq dst, src, const, idx0, idx1 - // Signed barrett multiplication using - // round-to-nearest-even-integer approximation. 
- // Following https://eprint.iacr.org/2021/986.pdf, this - // is functionally the same as a signed Montgomery multiplication - // with a suitable constant of absolute value < q. - sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()] - mul \dst\().8h, \src\().8h, \const\().h[\idx0\()] - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro mulmod dst, src, const, const_twisted - sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h - mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro gs_butterfly a, b, root, idx0, idx1 - sub tmp.8h, \a\().8h, \b\().8h - add \a\().8h, \a\().8h, \b\().8h - mulmodq \b, tmp, \root, \idx0, \idx1 -.endm - -.macro gs_butterfly_v a, b, root, root_twisted - sub tmp.8h, \a\().8h, \b\().8h - add \a\().8h, \a\().8h, \b\().8h - mulmod \b, tmp, \root, \root_twisted -.endm - -.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 - mulmod \dst0, \src0, ninv, ninv_tw - mulmod \dst1, \src1, ninv, ninv_tw - mulmod \dst2, \src2, ninv, ninv_tw - mulmod \dst3, \src3, ninv, ninv_tw -.endm - -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] -.endm - -.macro load_roots_012 - ldr q_root0, [r01234_ptr], #32 - ldr q_root1, [r01234_ptr, #-16] -.endm - -.macro load_next_roots_34 - ldr q_root0, [r01234_ptr], #16 -.endm - -.macro load_next_roots_56 - ldr q_root0, [r56_ptr], #(6*16) - ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)] - ldr q_root1, [r56_ptr, #(-6*16 + 2*16)] - ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)] - ldr q_root2, [r56_ptr, #(-6*16 + 4*16)] - ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)] -.endm - -.macro transpose4 data - trn1 t0.4s, \data\()0.4s, \data\()1.4s - trn2 t1.4s, \data\()0.4s, \data\()1.4s - trn1 t2.4s, \data\()2.4s, \data\()3.4s - trn2 t3.4s, \data\()2.4s, \data\()3.4s - - trn2 \data\()2.2d, t0.2d, t2.2d - trn2 \data\()3.2d, t1.2d, t3.2d - trn1 \data\()0.2d, t0.2d, t2.2d - trn1 \data\()1.2d, t1.2d, t3.2d -.endm - -.macro 
transpose_single data_out, data_in - trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s - trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s - trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s - trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs -.endm - -.macro pop_stack - restore_vregs -.endm - -// For comparability reasons, the output range for the coefficients of this -// invNTT code is supposed to match the implementation from PQClean on commit -// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients -// are NOT canonically reduced. The ordering of the coefficients is canonical, -// also matching PQClean. 
- -.text - .global MLKEM_ASM_NAMESPACE(intt_asm_clean) - - in .req x0 - r01234_ptr .req x1 - r56_ptr .req x2 - - inp .req x3 - count .req x4 - wtmp .req w5 - - data0 .req v8 - data1 .req v9 - data2 .req v10 - data3 .req v11 - data4 .req v12 - data5 .req v13 - data6 .req v14 - data7 .req v15 - - q_data0 .req q8 - q_data1 .req q9 - q_data2 .req q10 - q_data3 .req q11 - q_data4 .req q12 - q_data5 .req q13 - q_data6 .req q14 - q_data7 .req q15 - - root0 .req v0 - root1 .req v1 - root2 .req v2 - root0_tw .req v4 - root1_tw .req v5 - root2_tw .req v6 - - consts .req v7 - q_consts .req q7 - - q_root0 .req q0 - q_root1 .req q1 - q_root2 .req q2 - q_root0_tw .req q4 - q_root1_tw .req q5 - q_root2_tw .req q6 - - tmp .req v24 - t0 .req v25 - t1 .req v26 - t2 .req v27 - t3 .req v28 - - ninv .req v29 - ninv_tw .req v30 - -.balign 4 -MLKEM_ASM_NAMESPACE(intt_asm_clean): - push_stack - - // Setup constants - mov wtmp, #3329 - mov consts.h[0], wtmp - mov wtmp, #20159 - mov consts.h[1], wtmp - mov wtmp, #512 - dup ninv.8h, wtmp - mov wtmp, #5040 - dup ninv_tw.8h, wtmp - - mov inp, in - mov count, #8 - -scale_start: - - ldr q_data0, [inp, #(16*0)] - ldr q_data1, [inp, #(16*1)] - ldr q_data2, [inp, #(16*2)] - ldr q_data3, [inp, #(16*3)] - - mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - // Bounds: Absolute value < q - - str q_data0, [inp], #64 - str q_data1, [inp, #(-64 + 16*1)] - str q_data2, [inp, #(-64 + 16*2)] - str q_data3, [inp, #(-64 + 16*3)] - - subs count, count, #1 - cbnz count, scale_start - - mov inp, in - mov count, #8 - - .p2align 2 -layer3456_start: - - ldr q_data0, [inp, #(16*0)] - ldr q_data1, [inp, #(16*1)] - ldr q_data2, [inp, #(16*2)] - ldr q_data3, [inp, #(16*3)] - - transpose4 data // manual ld4 - - load_next_roots_56 - - // Layer 7 - gs_butterfly_v data0, data1, root1, root1_tw - gs_butterfly_v data2, data3, root2, root2_tw - // Bounds: - // data0, data2: < 2q - // data1, data3: < q - - // Layer 6 - gs_butterfly_v data0, data2, root0, 
root0_tw - gs_butterfly_v data1, data3, root0, root0_tw - // Bounds: - // data0: < 4q - // data1: < 2q - // data2, data3: < q - - transpose4 data - - load_next_roots_34 - - // Layer 5 - gs_butterfly data0, data1, root0, 2, 3 - gs_butterfly data2, data3, root0, 4, 5 - // Max bound: 8q - - // Not all of those reductions are needed, but the bounds tracking - // is easier if we uniformly reduce at this point. - barrett_reduce data0 - barrett_reduce data2 - barrett_reduce data1 - barrett_reduce data3 - - // Bounds: q/2 - - // Layer 4 - gs_butterfly data0, data2, root0, 0, 1 - gs_butterfly data1, data3, root0, 0, 1 - // Bounds: < q - - str q_data0, [inp], #(64) - str q_data1, [inp, #(-64 + 16*1)] - str q_data2, [inp, #(-64 + 16*2)] - str q_data3, [inp, #(-64 + 16*3)] - - subs count, count, #1 - cbnz count, layer3456_start - - // --------------------------------------------------------------------- - - mov count, #4 - load_roots_012 - - .p2align 2 - -layer012_start: - - ldr q_data0, [in, #0] - ldr q_data1, [in, #(1*(512/8))] - ldr q_data2, [in, #(2*(512/8))] - ldr q_data3, [in, #(3*(512/8))] - ldr q_data4, [in, #(4*(512/8))] - ldr q_data5, [in, #(5*(512/8))] - ldr q_data6, [in, #(6*(512/8))] - ldr q_data7, [in, #(7*(512/8))] - - gs_butterfly data0, data1, root0, 6, 7 - gs_butterfly data2, data3, root1, 0, 1 - gs_butterfly data4, data5, root1, 2, 3 - gs_butterfly data6, data7, root1, 4, 5 - - gs_butterfly data0, data2, root0, 2, 3 - gs_butterfly data1, data3, root0, 2, 3 - gs_butterfly data4, data6, root0, 4, 5 - gs_butterfly data5, data7, root0, 4, 5 - - gs_butterfly data0, data4, root0, 0, 1 - gs_butterfly data1, data5, root0, 0, 1 - gs_butterfly data2, data6, root0, 0, 1 - gs_butterfly data3, data7, root0, 0, 1 - - // Bounds: < 8q - - str q_data4, [in, #(4*(512/8))] - str q_data5, [in, #(5*(512/8))] - str q_data6, [in, #(6*(512/8))] - str q_data7, [in, #(7*(512/8))] - - str q_data0, [in], #(16) - str q_data1, [in, #(-16 + 1*(512/8))] - str q_data2, [in, #(-16 + 
2*(512/8))] - str q_data3, [in, #(-16 + 3*(512/8))] - - subs count, count, #1 - cbnz count, layer012_start - - pop_stack - ret - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq in - .unreq r01234_ptr - .unreq r56_ptr - .unreq inp - .unreq count - .unreq wtmp - .unreq data0 - .unreq data1 - .unreq data2 - .unreq data3 - .unreq data4 - .unreq data5 - .unreq data6 - .unreq data7 - .unreq q_data0 - .unreq q_data1 - .unreq q_data2 - .unreq q_data3 - .unreq q_data4 - .unreq q_data5 - .unreq q_data6 - .unreq q_data7 - .unreq root0 - .unreq root1 - .unreq root2 - .unreq root0_tw - .unreq root1_tw - .unreq root2_tw - .unreq consts - .unreq q_consts - .unreq q_root0 - .unreq q_root1 - .unreq q_root2 - .unreq q_root0_tw - .unreq q_root1_tw - .unreq q_root2_tw - .unreq tmp - .unreq t0 - .unreq t1 - .unreq t2 - .unreq t3 - .unreq ninv - .unreq ninv_tw - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S index 191de3c4d..0f9e44307 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/intt_opt.S @@ -25,6 +25,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) @@ -139,9 +140,6 @@ // are NOT canonically reduced. The ordering of the coefficients is canonical, // also matching PQClean. 
-.text - .global MLKEM_ASM_NAMESPACE(intt_asm_opt) - in .req x0 r01234_ptr .req x1 r56_ptr .req x2 @@ -194,7 +192,9 @@ ninv .req v29 ninv_tw .req v30 -.balign 4 + .text + .global MLKEM_ASM_NAMESPACE(intt_asm_opt) + .balign 4 MLKEM_ASM_NAMESPACE(intt_asm_opt): push_stack @@ -1042,4 +1042,5 @@ layer012_start: .unreq ninv .unreq ninv_tw +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S deleted file mode 100644 index 4f844e212..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/ntt_clean.S +++ /dev/null @@ -1,317 +0,0 @@ -/// -/// Copyright (c) 2022 Arm Limited -/// Copyright (c) 2022 Hanno Becker -/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer -/// Copyright (c) 2024 The mlkem-native project authors -// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE. -/// - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - -// Bounds: -// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) -// -// See mlken/reduce.c and test/test_bounds.py for more details. -.macro mulmodq dst, src, const, idx0, idx1 - // Signed barrett multiplication using - // round-to-nearest-even-integer approximation. - // Following https://eprint.iacr.org/2021/986.pdf, this - // is functionally the same as a signed Montgomery multiplication - // with a suitable constant of absolute value < q. - sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()] - mul \dst\().8h, \src\().8h, \const\().h[\idx0\()] - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro mulmod dst, src, const, const_twisted - sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h - mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro ct_butterfly a, b, root, idx0, idx1 - mulmodq tmp, \b, \root, \idx0, \idx1 - sub \b\().8h, \a\().8h, tmp.8h - add \a\().8h, \a\().8h, tmp.8h -.endm - -.macro ct_butterfly_v a, b, root, root_twisted - mulmod tmp, \b, \root, \root_twisted - sub \b\().8h, \a\().8h, tmp.8h - add \a\().8h, \a\().8h, tmp.8h -.endm - -.macro load_roots_012 - ldr q_root0, [r01234_ptr], #32 - ldr q_root1, [r01234_ptr, #-16] -.endm - -.macro load_next_roots_34 - ldr q_root0, [r01234_ptr], #16 -.endm - -.macro load_next_roots_56 - ldr q_root0, [r56_ptr], #(6*16) - ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)] - ldr q_root1, [r56_ptr, #(-6*16 + 2*16)] - ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)] - ldr q_root2, [r56_ptr, #(-6*16 + 4*16)] - ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)] -.endm - -.macro transpose4 data - trn1 t0.4s, 
\data\()0.4s, \data\()1.4s - trn2 t1.4s, \data\()0.4s, \data\()1.4s - trn1 t2.4s, \data\()2.4s, \data\()3.4s - trn2 t3.4s, \data\()2.4s, \data\()3.4s - - trn2 \data\()2.2d, t0.2d, t2.2d - trn2 \data\()3.2d, t1.2d, t3.2d - trn1 \data\()0.2d, t0.2d, t2.2d - trn1 \data\()1.2d, t1.2d, t3.2d -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs -.endm - -.macro pop_stack - restore_vregs -.endm - - // Arguments - in .req x0 // Input/output buffer - r01234_ptr .req x1 // twiddles for layer 0,1,2,3,4 - r56_ptr .req x2 // twiddles for layer 5,6 - - inp .req x3 - count .req x4 - wtmp .req w5 - - data0 .req v8 - data1 .req v9 - data2 .req v10 - data3 .req v11 - data4 .req v12 - data5 .req v13 - data6 .req v14 - data7 .req v15 - - q_data0 .req q8 - q_data1 .req q9 - q_data2 .req q10 - q_data3 .req q11 - q_data4 .req q12 - q_data5 .req q13 - q_data6 .req q14 - q_data7 .req q15 - - root0 .req v0 - root1 .req v1 - root2 .req v2 - root0_tw .req v4 - root1_tw .req v5 - root2_tw .req v6 - - q_root0 .req q0 - q_root1 .req q1 - q_root2 .req q2 - q_root0_tw .req q4 - q_root1_tw .req q5 - q_root2_tw .req q6 - - consts .req v7 - - tmp .req v24 - t0 .req v25 - t1 .req v26 - t2 .req v27 - t3 .req v28 - - .text - .global MLKEM_ASM_NAMESPACE(ntt_asm_clean) - - .balign 4 -MLKEM_ASM_NAMESPACE(ntt_asm_clean): - push_stack - - mov wtmp, #3329 - mov consts.h[0], wtmp - mov wtmp, #20159 - mov consts.h[1], wtmp - - mov inp, in - mov count, #4 - - load_roots_012 - - .p2align 2 - - // Bounds reasoning: - // - There are 7 layers - // - When passing from layer N to layer N+1, each layer-N value - // is modified through the addition/subtraction of a Montgomery - // product of a twiddle of absolute 
value < q/2 and a layer-N value. - // - Recalling that for C such that |a| < C * q and |t|> 0); - xtn out0.8b, data0.8h - - // r[3 * i + 1] = (t0 >> 8); - shrn out1.8b, data0.8h, #8 - xtn tmp.8b, data1.8h - // r[3 * i + 1] = (t0 >> 8) | (t1 << 4); - sli out1.8b, tmp.8b, #4 - - // r[3 * i + 2] = (t1 >> 4); - shrn out2.8b, data1.8h, #4 - - st3 {out0.8b, out1.8b, out2.8b}, [dst], #24 - - subs count, count, #1 - cbnz count, poly_tobytes_asm_clean_asm_loop_start - ret - - .unreq data0 - .unreq data1 - .unreq out0 - .unreq out1 - .unreq out2 - .unreq tmp - .unreq dst - .unreq src - .unreq count - -/********************************** - * poly_tomont() * - **********************************/ -.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean) - - src .req x0 - count .req x1 - wtmp .req w2 - - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 - - factor .req v2 - factor_t .req v3 - modulus .req v4 - modulus_twisted .req v5 - - tmp0 .req v6 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean): - - mov wtmp, #3329 // ML-KEM modulus - dup modulus.8h, wtmp - - mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 - dup modulus_twisted.8h, wtmp - - mov wtmp, #-1044 // 2^16 % 3329 - dup factor.8h, wtmp - - mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16) - dup factor_t.8h, wtmp - - mov count, #8 -poly_tomont_asm_loop: - - ldr q_data, [src], #64 - mulmod res, data, factor, factor_t - str q_res, [src, #-64] - - ldr q_data, [src, #-48] - mulmod res, data, factor, factor_t - str q_res, [src, #-48] - - ldr q_data, [src, #-32] - mulmod res, data, factor, factor_t - str q_res, [src, #-32] - - ldr q_data, [src, #-16] - mulmod res, data, factor, factor_t - str q_res, [src, #-16] - - sub count, count, #1 - cbnz count, poly_tomont_asm_loop - - ret - - .unreq src - .unreq count - .unreq wtmp - - .unreq data - .unreq q_data - .unreq res - .unreq q_res - - .unreq factor - .unreq factor_t - .unreq modulus - .unreq modulus_twisted - - .unreq tmp0 - -#endif /* 
MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S new file mode 100644 index 000000000..a3593b7fd --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in modulus.h[0]. */ +.macro mulmod dst, src, const, const_twisted + sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h + mul \dst\().8h, \src\().8h, \const\().8h + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/******************************************** + * poly_mulcache_compute() * + ********************************************/ + + + cache_ptr .req x0 + data_ptr .req x1 + zeta_ptr .req x2 + zeta_twisted_ptr .req x3 + count .req x4 + wtmp .req w5 + + data_odd .req v0 + zeta .req v1 + q_zeta .req q1 + zeta_twisted .req v2 + q_zeta_twisted .req q2 + + tmp0 .req v3 + q_tmp0 .req q3 + tmp1 .req v4 + q_tmp1 .req q4 + dst .req v5 + q_dst .req q5 + + modulus .req v6 + modulus_twisted .req v7 + + .text + .global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #20159 + dup modulus_twisted.8h, wtmp + + mov count, #16 + // Instructions: 7 + // Expected cycles: 12 + // Expected IPC: 0.58 + + // Cycle bound: 12.0 + // IPC bound: 0.58 + + // Wall time: 0.01s + // User time: 0.01s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q1, [x1, #16] // *............................. 
+ ldr q27, [x1], #32 // ..*........................... + ldr q23, [x2], #16 // ....*......................... + uzp2 v27.8H, v27.8H, v1.8H // ......*....................... + ldr q1, [x3], #16 // .......*...................... + mul v2.8H, v27.8H, v23.8H // .........*.................... + sqrdmulh v27.8H, v27.8H, v1.8H // ...........*.................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q29, [x1, #16] // *.............................. + // ldr q21, [x2], #16 // ....*.......................... + // ldr q27, [x1], #32 // ..*............................ + // ldr q7, [x3], #16 // .......*....................... + // uzp2 v28.8H, v27.8H, v29.8H // ......*........................ + // mul v2.8H, v28.8H, v21.8H // .........*..................... + // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*................... + + sub count, count, #1 +poly_mulcache_compute_asm_opt_loop: + // Instructions: 9 + // Expected cycles: 13 + // Expected IPC: 0.69 + + // Cycle bound: 13.0 + // IPC bound: 0.69 + + // Wall time: 0.09s + // User time: 0.09s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q29, [x1, #16] // e............................. + ldr q21, [x2], #16 // ..e........................... + mls v2.8H, v27.8H, v6.H[0] // ....*......................... + ldr q27, [x1], #32 // .....e........................ + ldr q7, [x3], #16 // .......e...................... + uzp2 v28.8H, v27.8H, v29.8H // .........e.................... + str q2, [x0], #16 // ..........*................... + mul v2.8H, v28.8H, v21.8H // ...........e.................. + sqrdmulh v27.8H, v28.8H, v7.8H // ............e................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q3, [x1], #32 // .....e.......'....~.......'.... + // ldr q4, [x1, #-16] // e............~............~.... + // ldr q1, [x2], #16 // ..e..........'.~..........'.~.. 
+ // ldr q2, [x3], #16 // .......e.....'......~.....'.... + // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'.... + // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'.... + // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'.... + // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'.... + // str q5, [x0], #16 // ..........~..'.........*..'.... + + sub count, count, 1 + cbnz count, poly_mulcache_compute_asm_opt_loop + // Instructions: 2 + // Expected cycles: 5 + // Expected IPC: 0.40 + + // Cycle bound: 5.0 + // IPC bound: 0.40 + + // Wall time: 0.00s + // User time: 0.00s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + mls v2.8H, v27.8H, v6.H[0] // *............................. + str q2, [x0], #16 // ....*......................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // mls v2.8H, v27.8H, v6.H[0] // *.............................. + // str q2, [x0], #16 // ....*.......................... 
+ + + ret + + .unreq cache_ptr + .unreq data_ptr + .unreq zeta_ptr + .unreq zeta_twisted_ptr + .unreq count + .unreq wtmp + + .unreq data_odd + .unreq zeta + .unreq q_zeta + .unreq zeta_twisted + .unreq q_zeta_twisted + + .unreq tmp0 + .unreq q_tmp0 + .unreq tmp1 + .unreq q_tmp1 + .unreq dst + .unreq q_dst + + .unreq modulus + .unreq modulus_twisted + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S deleted file mode 100644 index 79605818f..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_opt.S +++ /dev/null @@ -1,670 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) - -/* - * Some modular arithmetic macros - */ - -/* Barrett reduction */ -.macro barrett_reduce a - sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] - srshr tmp.8h, tmp.8h, #11 - mls \a\().8h, tmp.8h, modulus.h[0] -.endm - -/* Montgomery multiplication, with precomputed Montgomery twist - * Expects modulus in consts.h[0]. */ -.macro mulmod dst, src, const, const_twisted - sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h - mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, tmp0.8h, modulus.h[0] -.endm - -/* Turns signed-canonical to unsigned canonical representative - * through conditional addition of the modulus. - * - * Expected modulus in `modulus`. 
*/ -.macro scalar_signed_to_unsigned a - sshr mask.8h, \a\().8h, #15 - and mask.16b, modulus.16b, mask.16b - add \a\().8h, \a\().8h, mask.8h -.endm - -/********************************** - * poly_reduce() * - **********************************/ - -.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt) - - ptr .req x0 - count .req x1 - wtmp .req w2 - - data .req v0 - q_data .req q0 - - tmp .req v1 - mask .req v2 - modulus .req v3 - modulus_twisted .req v4 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): - - mov wtmp, #3329 // ML-KEM modulus - dup modulus.8h, wtmp - - mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 - dup modulus_twisted.8h, wtmp - - mov count, #8 - // Instructions: 15 - // Expected cycles: 22 - // Expected IPC: 0.68 - - // Cycle bound: 22.0 - // IPC bound: 0.68 - - // Wall time: 0.05s - // User time: 0.05s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q21, [x0, #32] // *............................. - ldr q23, [x0, #48] // ..*........................... - sqdmulh v7.8H, v21.8H, v4.H[0] // ....*......................... - sqdmulh v30.8H, v23.8H, v4.H[0] // ......*....................... - srshr v7.8H, v7.8H, #11 // ........*..................... - srshr v30.8H, v30.8H, #11 // ..........*................... - mls v21.8H, v7.8H, v3.H[0] // ...........*.................. - mls v23.8H, v30.8H, v3.H[0] // .............*................ - ldr q5, [x0, #16] // ..............*............... - sshr v7.8H, v21.8H, #15 // ................*............. - sshr v30.8H, v23.8H, #15 // .................*............ - and v7.16B, v3.16B, v7.16B // ..................*........... - add v21.8H, v21.8H, v7.8H // ...................*.......... - and v7.16B, v3.16B, v30.16B // ....................*......... - add v16.8H, v23.8H, v7.8H // .....................*........ - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q30, [x0, #32] // *.............................. 
- // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*.......................... - // ldr q2, [x0, #48] // ..*............................ - // srshr v19.8H, v22.8H, #11 // ........*...................... - // mls v30.8H, v19.8H, v3.H[0] // ...........*................... - // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................ - // sshr v31.8H, v30.8H, #15 // ................*.............. - // srshr v25.8H, v25.8H, #11 // ..........*.................... - // and v18.16B, v3.16B, v31.16B // ..................*............ - // mls v2.8H, v25.8H, v3.H[0] // .............*................. - // add v21.8H, v30.8H, v18.8H // ...................*........... - // ldr q5, [x0, #16] // ..............*................ - // sshr v18.8H, v2.8H, #15 // .................*............. - // and v27.16B, v3.16B, v18.16B // ....................*.......... - // add v16.8H, v2.8H, v27.8H // .....................*......... - - sub count, count, #1 -1: - // Instructions: 32 - // Expected cycles: 36 - // Expected IPC: 0.89 - - // Cycle bound: 36.0 - // IPC bound: 0.89 - - // Wall time: 1.05s - // User time: 1.05s - - // -------- cycle (expected) ---------> - // 0 25 - // |------------------------|---------- - ldr q6, [x0], #64 // *................................... - ldr q30, [x0, #32] // ..e................................. - sqdmulh v31.8H, v6.8H, v4.H[0] // ....*............................... - sqdmulh v29.8H, v5.8H, v4.H[0] // .....*.............................. - sqdmulh v22.8H, v30.8H, v4.H[0] // ......e............................. - str q16, [x0, #-16] // .......*............................ - srshr v20.8H, v31.8H, #11 // ........*........................... - srshr v28.8H, v29.8H, #11 // .........*.......................... - str q21, [x0, #-32] // ..........*......................... - mls v6.8H, v20.8H, v3.H[0] // ...........*........................ - mls v5.8H, v28.8H, v3.H[0] // ............*....................... 
- ldr q2, [x0, #48] // .............e...................... - sshr v31.8H, v6.8H, #15 // ...............*.................... - srshr v19.8H, v22.8H, #11 // ................e................... - and v22.16B, v3.16B, v31.16B // .................*.................. - add v0.8H, v6.8H, v22.8H // ..................*................. - mls v30.8H, v19.8H, v3.H[0] // ...................e................ - sshr v26.8H, v5.8H, #15 // ....................*............... - sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e.............. - and v17.16B, v3.16B, v26.16B // ......................*............. - add v1.8H, v5.8H, v17.8H // .......................*............ - sshr v31.8H, v30.8H, #15 // ........................e........... - srshr v25.8H, v25.8H, #11 // .........................e.......... - str q1, [x0, #-48] // ..........................*......... - and v18.16B, v3.16B, v31.16B // ...........................e........ - mls v2.8H, v25.8H, v3.H[0] // ............................e....... - add v21.8H, v30.8H, v18.8H // .............................e...... - ldr q5, [x0, #16] // ..............................e..... - sshr v18.8H, v2.8H, #15 // ................................e... - str q0, [x0, #-64] // .................................*.. - and v27.16B, v3.16B, v18.16B // ..................................e. - add v16.8H, v2.8H, v27.8H // ...................................e - - // ------------------------ cycle (expected) -------------------------> - // 0 25 50 - // |------------------------|------------------------|----------------- - // ldr q0, [x0], #64 // ..................................*................................. - // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*............................. - // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*......................... - // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*...................... 
- // sshr v2.8h, v0.8h, #15 // .............~....................'..............*.................. - // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................ - // add v0.8h, v0.8h, v2.8h // ................~.................'.................*............... - // str q0, [x0, #-64] // ...............................~..'................................* - // ldr q0, [x0, #-48] // ............................e.....'.............................~... - // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................ - // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................ - // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*..................... - // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*............. - // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*........... - // add v0.8h, v0.8h, v2.8h // .....................~............'......................*.......... - // str q0, [x0, #-48] // ........................~.........'.........................*....... - // ldr q0, [x0, #-32] // e.................................'.~............................... - // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~........................... - // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~................. - // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~.............. - // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~......... - // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~...... - // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~.... 
- // str q0, [x0, #-32] // ........~.........................'.........*....................... - // ldr q0, [x0, #-16] // ...........e......................'............~.................... - // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............ - // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ - // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~..... - // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~. - // and v2.16b, v3.16b, v2.16b // ................................e.'................................. - // add v0.8h, v0.8h, v2.8h // .................................e'................................. - // str q0, [x0, #-16] // .....~............................'......*.......................... - - sub count, count, 1 - cbnz count, 1b - // Instructions: 17 - // Expected cycles: 23 - // Expected IPC: 0.74 - - // Cycle bound: 23.0 - // IPC bound: 0.74 - - // Wall time: 0.05s - // User time: 0.05s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - sqdmulh v20.8H, v5.8H, v4.H[0] // *............................. - ldr q24, [x0], #64 // .*............................ - str q21, [x0, #-32] // ...*.......................... - srshr v20.8H, v20.8H, #11 // ....*......................... - sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................ - str q16, [x0, #-16] // ......*....................... - mls v5.8H, v20.8H, v3.H[0] // .......*...................... - srshr v20.8H, v25.8H, #11 // .........*.................... - sshr v2.8H, v5.8H, #15 // ...........*.................. - mls v24.8H, v20.8H, v3.H[0] // ............*................. - and v20.16B, v3.16B, v2.16B // .............*................ - add v31.8H, v5.8H, v20.8H // ..............*............... - sshr v20.8H, v24.8H, #15 // ................*............. 
- str q31, [x0, #-48] // .................*............ - and v31.16B, v3.16B, v20.16B // ..................*........... - add v24.8H, v24.8H, v31.8H // ...................*.......... - str q24, [x0, #-64] // ......................*....... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q6, [x0], #64 // .*............................. - // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*......................... - // sqdmulh v29.8H, v5.8H, v4.H[0] // *.............................. - // str q16, [x0, #-16] // ......*........................ - // srshr v20.8H, v31.8H, #11 // .........*..................... - // srshr v28.8H, v29.8H, #11 // ....*.......................... - // str q21, [x0, #-32] // ...*........................... - // mls v6.8H, v20.8H, v3.H[0] // ............*.................. - // mls v5.8H, v28.8H, v3.H[0] // .......*....................... - // sshr v31.8H, v6.8H, #15 // ................*.............. - // and v22.16B, v3.16B, v31.16B // ..................*............ - // add v0.8H, v6.8H, v22.8H // ...................*........... - // sshr v26.8H, v5.8H, #15 // ...........*................... - // and v17.16B, v3.16B, v26.16B // .............*................. - // add v1.8H, v5.8H, v17.8H // ..............*................ - // str q1, [x0, #-48] // .................*............. - // str q0, [x0, #-64] // ......................*........ 
- - - ret - - .unreq ptr - .unreq count - .unreq wtmp - - .unreq data - .unreq q_data - - .unreq tmp - .unreq mask - .unreq modulus - .unreq modulus_twisted - -/******************************************** - * poly_mulcache_compute() * - ********************************************/ - -.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt) - - cache_ptr .req x0 - data_ptr .req x1 - zeta_ptr .req x2 - zeta_twisted_ptr .req x3 - count .req x4 - wtmp .req w5 - - data_odd .req v0 - zeta .req v1 - q_zeta .req q1 - zeta_twisted .req v2 - q_zeta_twisted .req q2 - - tmp0 .req v3 - q_tmp0 .req q3 - tmp1 .req v4 - q_tmp1 .req q4 - dst .req v5 - q_dst .req q5 - - modulus .req v6 - modulus_twisted .req v7 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #20159 - dup modulus_twisted.8h, wtmp - - mov count, #16 - // Instructions: 7 - // Expected cycles: 12 - // Expected IPC: 0.58 - - // Cycle bound: 12.0 - // IPC bound: 0.58 - - // Wall time: 0.01s - // User time: 0.01s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q1, [x1, #16] // *............................. - ldr q27, [x1], #32 // ..*........................... - ldr q23, [x2], #16 // ....*......................... - uzp2 v27.8H, v27.8H, v1.8H // ......*....................... - ldr q1, [x3], #16 // .......*...................... - mul v2.8H, v27.8H, v23.8H // .........*.................... - sqrdmulh v27.8H, v27.8H, v1.8H // ...........*.................. - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q29, [x1, #16] // *.............................. - // ldr q21, [x2], #16 // ....*.......................... - // ldr q27, [x1], #32 // ..*............................ - // ldr q7, [x3], #16 // .......*....................... - // uzp2 v28.8H, v27.8H, v29.8H // ......*........................ - // mul v2.8H, v28.8H, v21.8H // .........*..................... 
- // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*................... - - sub count, count, #1 -1: - // Instructions: 9 - // Expected cycles: 13 - // Expected IPC: 0.69 - - // Cycle bound: 13.0 - // IPC bound: 0.69 - - // Wall time: 0.09s - // User time: 0.09s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q29, [x1, #16] // e............................. - ldr q21, [x2], #16 // ..e........................... - mls v2.8H, v27.8H, v6.H[0] // ....*......................... - ldr q27, [x1], #32 // .....e........................ - ldr q7, [x3], #16 // .......e...................... - uzp2 v28.8H, v27.8H, v29.8H // .........e.................... - str q2, [x0], #16 // ..........*................... - mul v2.8H, v28.8H, v21.8H // ...........e.................. - sqrdmulh v27.8H, v28.8H, v7.8H // ............e................. - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q3, [x1], #32 // .....e.......'....~.......'.... - // ldr q4, [x1, #-16] // e............~............~.... - // ldr q1, [x2], #16 // ..e..........'.~..........'.~.. - // ldr q2, [x3], #16 // .......e.....'......~.....'.... - // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'.... - // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'.... - // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'.... - // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'.... - // str q5, [x0], #16 // ..........~..'.........*..'.... - - sub count, count, 1 - cbnz count, 1b - // Instructions: 2 - // Expected cycles: 5 - // Expected IPC: 0.40 - - // Cycle bound: 5.0 - // IPC bound: 0.40 - - // Wall time: 0.00s - // User time: 0.00s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - mls v2.8H, v27.8H, v6.H[0] // *............................. - str q2, [x0], #16 // ....*......................... 
- - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // mls v2.8H, v27.8H, v6.H[0] // *.............................. - // str q2, [x0], #16 // ....*.......................... - - - ret - - .unreq cache_ptr - .unreq data_ptr - .unreq zeta_ptr - .unreq zeta_twisted_ptr - .unreq count - .unreq wtmp - - .unreq data_odd - .unreq zeta - .unreq q_zeta - .unreq zeta_twisted - .unreq q_zeta_twisted - - .unreq tmp0 - .unreq q_tmp0 - .unreq tmp1 - .unreq q_tmp1 - .unreq dst - .unreq q_dst - - .unreq modulus - .unreq modulus_twisted - -/******************************************** - * poly_tobytes() * - ********************************************/ -.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt) - - data0 .req v0 - data1 .req v1 - out0 .req v2 - out1 .req v3 - out2 .req v4 - tmp .req v5 - - dst .req x0 - src .req x1 - count .req x2 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt): - - mov count, #16 -poly_tobytes_asm_opt_asm_loop_start: - ld2 {data0.8h, data1.8h}, [src], #32 - - // r[3 * i + 0] = (t0 >> 0); - xtn out0.8b, data0.8h - - // r[3 * i + 1] = (t0 >> 8); - shrn out1.8b, data0.8h, #8 - xtn tmp.8b, data1.8h - // r[3 * i + 1] = (t0 >> 8) | (t1 << 4); - sli out1.8b, tmp.8b, #4 - - // r[3 * i + 2] = (t1 >> 4); - shrn out2.8b, data1.8h, #4 - - st3 {out0.8b, out1.8b, out2.8b}, [dst], #24 - - subs count, count, #1 - cbnz count, poly_tobytes_asm_opt_asm_loop_start - ret - - .unreq data0 - .unreq data1 - .unreq out0 - .unreq out1 - .unreq out2 - .unreq tmp - .unreq dst - .unreq src - .unreq count - -/********************************** - * poly_tomont() * - **********************************/ -.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt) - - src .req x0 - count .req x1 - wtmp .req w2 - - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 - - factor .req v2 - factor_t .req v3 - modulus .req v4 - modulus_twisted .req v5 - - tmp0 .req v6 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): - - mov wtmp, #3329 // ML-KEM 
modulus - dup modulus.8h, wtmp - - mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 - dup modulus_twisted.8h, wtmp - - mov wtmp, #-1044 // 2^16 % 3329 - dup factor.8h, wtmp - - mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16) - dup factor_t.8h, wtmp - - mov count, #8 - // Instructions: 5 - // Expected cycles: 7 - // Expected IPC: 0.71 - // - // Cycle bound: 7.0 - // IPC bound: 0.71 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q26, [x0, #48] // *............................. - ldr q23, [x0, #16] // ..*........................... - mul v17.8H, v26.8H, v2.8H // ....*......................... - sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................ - ldr q27, [x0, #32] // ......*....................... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q7, [x0, #48] // *.............................. - // ldr q23, [x0, #16] // ..*............................ - // mul v17.8H, v7.8H, v2.8H // ....*.......................... - // sqrdmulh v7.8H, v7.8H, v3.8H // .....*......................... - // ldr q27, [x0, #32] // ......*........................ - - sub count, count, #1 -1: - // Instructions: 20 - // Expected cycles: 24 - // Expected IPC: 0.83 - // - // Cycle bound: 24.0 - // IPC bound: 0.83 - // - // Wall time: 0.73s - // User time: 0.73s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - mls v17.8H, v7.8H, v4.H[0] // *............................. - sqrdmulh v5.8H, v23.8H, v3.8H // .*............................ - ldr q7, [x0], #64 // ..*........................... - str q17, [x0, #-16] // ....*......................... - sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................ - sqrdmulh v19.8H, v7.8H, v3.8H // ......*....................... - mul v25.8H, v23.8H, v2.8H // .......*...................... - mul v0.8H, v7.8H, v2.8H // ........*..................... 
- mul v26.8H, v27.8H, v2.8H // .........*.................... - ldr q7, [x0, #48] // ..........e................... - mls v25.8H, v5.8H, v4.H[0] // ............*................. - ldr q23, [x0, #16] // .............e................ - mls v26.8H, v29.8H, v4.H[0] // ...............*.............. - mls v0.8H, v19.8H, v4.H[0] // ................*............. - str q25, [x0, #-48] // .................*............ - mul v17.8H, v7.8H, v2.8H // ..................e........... - sqrdmulh v7.8H, v7.8H, v3.8H // ...................e.......... - str q0, [x0, #-64] // ....................*......... - ldr q27, [x0, #32] // .....................e........ - str q26, [x0, #-32] // .......................*...... - - // --------- cycle (expected) ----------> - // 0 25 - // |------------------------|------------ - // ldr q0, [x0], #64 // ..............'.*..................... - // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*................. - // mul v1.8h, v0.8h, v2.8h // ..............'.......*............... - // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*....... - // str q1, [x0, #-64] // ..........~...'...................*... - // ldr q0, [x0, #-48] // ...e..........'............~.......... - // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*...................... - // mul v1.8h, v0.8h, v2.8h // ..............'......*................ - // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*........... - // str q1, [x0, #-48] // .......~......'................*...... - // ldr q0, [x0, #-32] // ...........e..'....................~.. - // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*.................. - // mul v1.8h, v0.8h, v2.8h // ..............'........*.............. - // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........ - // str q1, [x0, #-32] // .............~'......................* - // ldr q0, [x0, #-16] // e.............'.........~............. 
- // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~.... - // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~..... - // mls v1.8h, v6.8h, v4.h[0] // ..............*....................... - // str q1, [x0, #-16] // ..............'...*................... - - sub count, count, 1 - cbnz count, 1b - // Instructions: 15 - // Expected cycles: 18 - // Expected IPC: 0.83 - // - // Cycle bound: 18.0 - // IPC bound: 0.83 - // - // Wall time: 0.07s - // User time: 0.07s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - mls v17.8H, v7.8H, v4.H[0] // *............................. - sqrdmulh v7.8H, v23.8H, v3.8H // .*............................ - mul v26.8H, v23.8H, v2.8H // ..*........................... - sqrdmulh v25.8H, v27.8H, v3.8H // ...*.......................... - ldr q23, [x0], #64 // ....*......................... - mul v27.8H, v27.8H, v2.8H // ......*....................... - mls v26.8H, v7.8H, v4.H[0] // .......*...................... - sqrdmulh v7.8H, v23.8H, v3.8H // ........*..................... - mul v23.8H, v23.8H, v2.8H // .........*.................... - str q17, [x0, #-16] // ..........*................... - mls v27.8H, v25.8H, v4.H[0] // ...........*.................. - str q26, [x0, #-48] // ............*................. - mls v23.8H, v7.8H, v4.H[0] // .............*................ - str q27, [x0, #-32] // ...............*.............. - str q23, [x0, #-64] // .................*............ - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // mls v17.8H, v7.8H, v4.H[0] // *.............................. - // sqrdmulh v5.8H, v23.8H, v3.8H // .*............................. - // ldr q7, [x0], #64 // ....*.......................... - // str q17, [x0, #-16] // ..........*.................... - // sqrdmulh v29.8H, v27.8H, v3.8H // ...*........................... - // sqrdmulh v19.8H, v7.8H, v3.8H // ........*...................... 
- // mul v25.8H, v23.8H, v2.8H // ..*............................ - // mul v0.8H, v7.8H, v2.8H // .........*..................... - // mul v26.8H, v27.8H, v2.8H // ......*........................ - // mls v25.8H, v5.8H, v4.H[0] // .......*....................... - // mls v26.8H, v29.8H, v4.H[0] // ...........*................... - // mls v0.8H, v19.8H, v4.H[0] // .............*................. - // str q25, [x0, #-48] // ............*.................. - // str q0, [x0, #-64] // .................*............. - // str q26, [x0, #-32] // ...............*............... - - - ret - - .unreq src - .unreq count - .unreq wtmp - - .unreq data - .unreq q_data - .unreq res - .unreq q_res - - .unreq factor - .unreq factor_t - .unreq modulus - .unreq modulus_twisted - - .unreq tmp0 - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_reduce_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_reduce_asm_opt.S new file mode 100644 index 000000000..410950730 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_reduce_asm_opt.S @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/* Barrett reduction */ +.macro barrett_reduce a + sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] + srshr tmp.8h, tmp.8h, #11 + mls \a\().8h, tmp.8h, modulus.h[0] +.endm + +/* Turns signed-canonical to unsigned canonical representative + * through conditional addition of the modulus. + * + * Expected modulus in `modulus`. 
*/ +.macro scalar_signed_to_unsigned a + sshr mask.8h, \a\().8h, #15 + and mask.16b, modulus.16b, mask.16b + add \a\().8h, \a\().8h, mask.8h +.endm + +/********************************** + * poly_reduce() * + **********************************/ + + ptr .req x0 + count .req x1 + wtmp .req w2 + + data .req v0 + q_data .req q0 + + tmp .req v1 + mask .req v2 + modulus .req v3 + modulus_twisted .req v4 + + .text + .global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): + + mov wtmp, #3329 // ML-KEM modulus + dup modulus.8h, wtmp + + mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 + dup modulus_twisted.8h, wtmp + + mov count, #8 + // Instructions: 15 + // Expected cycles: 22 + // Expected IPC: 0.68 + + // Cycle bound: 22.0 + // IPC bound: 0.68 + + // Wall time: 0.05s + // User time: 0.05s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q21, [x0, #32] // *............................. + ldr q23, [x0, #48] // ..*........................... + sqdmulh v7.8H, v21.8H, v4.H[0] // ....*......................... + sqdmulh v30.8H, v23.8H, v4.H[0] // ......*....................... + srshr v7.8H, v7.8H, #11 // ........*..................... + srshr v30.8H, v30.8H, #11 // ..........*................... + mls v21.8H, v7.8H, v3.H[0] // ...........*.................. + mls v23.8H, v30.8H, v3.H[0] // .............*................ + ldr q5, [x0, #16] // ..............*............... + sshr v7.8H, v21.8H, #15 // ................*............. + sshr v30.8H, v23.8H, #15 // .................*............ + and v7.16B, v3.16B, v7.16B // ..................*........... + add v21.8H, v21.8H, v7.8H // ...................*.......... + and v7.16B, v3.16B, v30.16B // ....................*......... + add v16.8H, v23.8H, v7.8H // .....................*........ + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q30, [x0, #32] // *.............................. 
+ // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*.......................... + // ldr q2, [x0, #48] // ..*............................ + // srshr v19.8H, v22.8H, #11 // ........*...................... + // mls v30.8H, v19.8H, v3.H[0] // ...........*................... + // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................ + // sshr v31.8H, v30.8H, #15 // ................*.............. + // srshr v25.8H, v25.8H, #11 // ..........*.................... + // and v18.16B, v3.16B, v31.16B // ..................*............ + // mls v2.8H, v25.8H, v3.H[0] // .............*................. + // add v21.8H, v30.8H, v18.8H // ...................*........... + // ldr q5, [x0, #16] // ..............*................ + // sshr v18.8H, v2.8H, #15 // .................*............. + // and v27.16B, v3.16B, v18.16B // ....................*.......... + // add v16.8H, v2.8H, v27.8H // .....................*......... + + sub count, count, #1 +poly_reduce_asm_opt_loop: + // Instructions: 32 + // Expected cycles: 36 + // Expected IPC: 0.89 + + // Cycle bound: 36.0 + // IPC bound: 0.89 + + // Wall time: 1.05s + // User time: 1.05s + + // -------- cycle (expected) ---------> + // 0 25 + // |------------------------|---------- + ldr q6, [x0], #64 // *................................... + ldr q30, [x0, #32] // ..e................................. + sqdmulh v31.8H, v6.8H, v4.H[0] // ....*............................... + sqdmulh v29.8H, v5.8H, v4.H[0] // .....*.............................. + sqdmulh v22.8H, v30.8H, v4.H[0] // ......e............................. + str q16, [x0, #-16] // .......*............................ + srshr v20.8H, v31.8H, #11 // ........*........................... + srshr v28.8H, v29.8H, #11 // .........*.......................... + str q21, [x0, #-32] // ..........*......................... + mls v6.8H, v20.8H, v3.H[0] // ...........*........................ + mls v5.8H, v28.8H, v3.H[0] // ............*....................... 
+ ldr q2, [x0, #48] // .............e...................... + sshr v31.8H, v6.8H, #15 // ...............*.................... + srshr v19.8H, v22.8H, #11 // ................e................... + and v22.16B, v3.16B, v31.16B // .................*.................. + add v0.8H, v6.8H, v22.8H // ..................*................. + mls v30.8H, v19.8H, v3.H[0] // ...................e................ + sshr v26.8H, v5.8H, #15 // ....................*............... + sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e.............. + and v17.16B, v3.16B, v26.16B // ......................*............. + add v1.8H, v5.8H, v17.8H // .......................*............ + sshr v31.8H, v30.8H, #15 // ........................e........... + srshr v25.8H, v25.8H, #11 // .........................e.......... + str q1, [x0, #-48] // ..........................*......... + and v18.16B, v3.16B, v31.16B // ...........................e........ + mls v2.8H, v25.8H, v3.H[0] // ............................e....... + add v21.8H, v30.8H, v18.8H // .............................e...... + ldr q5, [x0, #16] // ..............................e..... + sshr v18.8H, v2.8H, #15 // ................................e... + str q0, [x0, #-64] // .................................*.. + and v27.16B, v3.16B, v18.16B // ..................................e. + add v16.8H, v2.8H, v27.8H // ...................................e + + // ------------------------ cycle (expected) -------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------- + // ldr q0, [x0], #64 // ..................................*................................. + // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*............................. + // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*......................... + // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*...................... 
+ // sshr v2.8h, v0.8h, #15 // .............~....................'..............*.................. + // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................ + // add v0.8h, v0.8h, v2.8h // ................~.................'.................*............... + // str q0, [x0, #-64] // ...............................~..'................................* + // ldr q0, [x0, #-48] // ............................e.....'.............................~... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................ + // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................ + // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*..................... + // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*............. + // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*........... + // add v0.8h, v0.8h, v2.8h // .....................~............'......................*.......... + // str q0, [x0, #-48] // ........................~.........'.........................*....... + // ldr q0, [x0, #-32] // e.................................'.~............................... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~........................... + // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~................. + // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~.............. + // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~......... + // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~...... + // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~.... 
+ // str q0, [x0, #-32] // ........~.........................'.........*....................... + // ldr q0, [x0, #-16] // ...........e......................'............~.................... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............ + // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ + // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~..... + // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~. + // and v2.16b, v3.16b, v2.16b // ................................e.'................................. + // add v0.8h, v0.8h, v2.8h // .................................e'................................. + // str q0, [x0, #-16] // .....~............................'......*.......................... + + sub count, count, 1 + cbnz count, poly_reduce_asm_opt_loop + // Instructions: 17 + // Expected cycles: 23 + // Expected IPC: 0.74 + + // Cycle bound: 23.0 + // IPC bound: 0.74 + + // Wall time: 0.05s + // User time: 0.05s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + sqdmulh v20.8H, v5.8H, v4.H[0] // *............................. + ldr q24, [x0], #64 // .*............................ + str q21, [x0, #-32] // ...*.......................... + srshr v20.8H, v20.8H, #11 // ....*......................... + sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................ + str q16, [x0, #-16] // ......*....................... + mls v5.8H, v20.8H, v3.H[0] // .......*...................... + srshr v20.8H, v25.8H, #11 // .........*.................... + sshr v2.8H, v5.8H, #15 // ...........*.................. + mls v24.8H, v20.8H, v3.H[0] // ............*................. + and v20.16B, v3.16B, v2.16B // .............*................ + add v31.8H, v5.8H, v20.8H // ..............*............... 
+ sshr v20.8H, v24.8H, #15 // ................*............. + str q31, [x0, #-48] // .................*............ + and v31.16B, v3.16B, v20.16B // ..................*........... + add v24.8H, v24.8H, v31.8H // ...................*.......... + str q24, [x0, #-64] // ......................*....... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q6, [x0], #64 // .*............................. + // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*......................... + // sqdmulh v29.8H, v5.8H, v4.H[0] // *.............................. + // str q16, [x0, #-16] // ......*........................ + // srshr v20.8H, v31.8H, #11 // .........*..................... + // srshr v28.8H, v29.8H, #11 // ....*.......................... + // str q21, [x0, #-32] // ...*........................... + // mls v6.8H, v20.8H, v3.H[0] // ............*.................. + // mls v5.8H, v28.8H, v3.H[0] // .......*....................... + // sshr v31.8H, v6.8H, #15 // ................*.............. + // and v22.16B, v3.16B, v31.16B // ..................*............ + // add v0.8H, v6.8H, v22.8H // ...................*........... + // sshr v26.8H, v5.8H, #15 // ...........*................... + // and v17.16B, v3.16B, v26.16B // .............*................. + // add v1.8H, v5.8H, v17.8H // ..............*................ + // str q1, [x0, #-48] // .................*............. + // str q0, [x0, #-64] // ......................*........ 
+ + + ret + + .unreq ptr + .unreq count + .unreq wtmp + + .unreq data + .unreq q_data + + .unreq tmp + .unreq mask + .unreq modulus + .unreq modulus_twisted + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S new file mode 100644 index 000000000..bc33afd43 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/******************************************** + * poly_tobytes() * + ********************************************/ + + data0 .req v0 + data1 .req v1 + out0 .req v2 + out1 .req v3 + out2 .req v4 + tmp .req v5 + + dst .req x0 + src .req x1 + count .req x2 + + .text + .global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt): + + mov count, #16 +poly_tobytes_asm_opt_asm_loop_start: + ld2 {data0.8h, data1.8h}, [src], #32 + + // r[3 * i + 0] = (t0 >> 0); + xtn out0.8b, data0.8h + + // r[3 * i + 1] = (t0 >> 8); + shrn out1.8b, data0.8h, #8 + xtn tmp.8b, data1.8h + // r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + sli out1.8b, tmp.8b, #4 + + // r[3 * i + 2] = (t1 >> 4); + shrn out2.8b, data1.8h, #4 + + st3 {out0.8b, out1.8b, out2.8b}, [dst], #24 + + subs count, count, #1 + cbnz count, poly_tobytes_asm_opt_asm_loop_start + ret + + .unreq data0 + .unreq data1 + .unreq out0 + .unreq out1 + .unreq out2 + .unreq tmp + .unreq dst + .unreq src + .unreq count + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git 
a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tomont_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tomont_asm_opt.S new file mode 100644 index 000000000..bcbff9adb --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/poly_tomont_asm_opt.S @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in modulus.h[0]. */ +.macro mulmod dst, src, const, const_twisted + sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h + mul \dst\().8h, \src\().8h, \const\().8h + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/********************************** + * poly_tomont() * + **********************************/ + + src .req x0 + count .req x1 + wtmp .req w2 + + data .req v0 + q_data .req q0 + res .req v1 + q_res .req q1 + + factor .req v2 + factor_t .req v3 + modulus .req v4 + modulus_twisted .req v5 + + tmp0 .req v6 + + + .text + .global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): + + mov wtmp, #3329 // ML-KEM modulus + dup modulus.8h, wtmp + + mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 + dup modulus_twisted.8h, wtmp + + mov wtmp, #-1044 // 2^16 % 3329 + dup factor.8h, wtmp + + mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16) + dup factor_t.8h, wtmp + + mov count, #8 + // Instructions: 5 + // Expected cycles: 7 + // Expected IPC: 0.71 + // + // Cycle bound: 7.0 + // IPC bound: 0.71 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q26, [x0, #48] // *............................. + ldr q23, [x0, #16] // ..*...........................
+ mul v17.8H, v26.8H, v2.8H // ....*......................... + sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................ + ldr q27, [x0, #32] // ......*....................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q7, [x0, #48] // *.............................. + // ldr q23, [x0, #16] // ..*............................ + // mul v17.8H, v7.8H, v2.8H // ....*.......................... + // sqrdmulh v7.8H, v7.8H, v3.8H // .....*......................... + // ldr q27, [x0, #32] // ......*........................ + + sub count, count, #1 +poly_tomont_asm_opt_loop: + // Instructions: 20 + // Expected cycles: 24 + // Expected IPC: 0.83 + // + // Cycle bound: 24.0 + // IPC bound: 0.83 + // + // Wall time: 0.73s + // User time: 0.73s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + mls v17.8H, v7.8H, v4.H[0] // *............................. + sqrdmulh v5.8H, v23.8H, v3.8H // .*............................ + ldr q7, [x0], #64 // ..*........................... + str q17, [x0, #-16] // ....*......................... + sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................ + sqrdmulh v19.8H, v7.8H, v3.8H // ......*....................... + mul v25.8H, v23.8H, v2.8H // .......*...................... + mul v0.8H, v7.8H, v2.8H // ........*..................... + mul v26.8H, v27.8H, v2.8H // .........*.................... + ldr q7, [x0, #48] // ..........e................... + mls v25.8H, v5.8H, v4.H[0] // ............*................. + ldr q23, [x0, #16] // .............e................ + mls v26.8H, v29.8H, v4.H[0] // ...............*.............. + mls v0.8H, v19.8H, v4.H[0] // ................*............. + str q25, [x0, #-48] // .................*............ + mul v17.8H, v7.8H, v2.8H // ..................e........... + sqrdmulh v7.8H, v7.8H, v3.8H // ...................e.......... + str q0, [x0, #-64] // ....................*......... 
+ ldr q27, [x0, #32] // .....................e........ + str q26, [x0, #-32] // .......................*...... + + // --------- cycle (expected) ----------> + // 0 25 + // |------------------------|------------ + // ldr q0, [x0], #64 // ..............'.*..................... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*................. + // mul v1.8h, v0.8h, v2.8h // ..............'.......*............... + // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*....... + // str q1, [x0, #-64] // ..........~...'...................*... + // ldr q0, [x0, #-48] // ...e..........'............~.......... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*...................... + // mul v1.8h, v0.8h, v2.8h // ..............'......*................ + // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*........... + // str q1, [x0, #-48] // .......~......'................*...... + // ldr q0, [x0, #-32] // ...........e..'....................~.. + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*.................. + // mul v1.8h, v0.8h, v2.8h // ..............'........*.............. + // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........ + // str q1, [x0, #-32] // .............~'......................* + // ldr q0, [x0, #-16] // e.............'.........~............. + // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~.... + // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~..... + // mls v1.8h, v6.8h, v4.h[0] // ..............*....................... + // str q1, [x0, #-16] // ..............'...*................... + + sub count, count, 1 + cbnz count, poly_tomont_asm_opt_loop + // Instructions: 15 + // Expected cycles: 18 + // Expected IPC: 0.83 + // + // Cycle bound: 18.0 + // IPC bound: 0.83 + // + // Wall time: 0.07s + // User time: 0.07s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + mls v17.8H, v7.8H, v4.H[0] // *............................. 
+ sqrdmulh v7.8H, v23.8H, v3.8H // .*............................ + mul v26.8H, v23.8H, v2.8H // ..*........................... + sqrdmulh v25.8H, v27.8H, v3.8H // ...*.......................... + ldr q23, [x0], #64 // ....*......................... + mul v27.8H, v27.8H, v2.8H // ......*....................... + mls v26.8H, v7.8H, v4.H[0] // .......*...................... + sqrdmulh v7.8H, v23.8H, v3.8H // ........*..................... + mul v23.8H, v23.8H, v2.8H // .........*.................... + str q17, [x0, #-16] // ..........*................... + mls v27.8H, v25.8H, v4.H[0] // ...........*.................. + str q26, [x0, #-48] // ............*................. + mls v23.8H, v7.8H, v4.H[0] // .............*................ + str q27, [x0, #-32] // ...............*.............. + str q23, [x0, #-64] // .................*............ + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // mls v17.8H, v7.8H, v4.H[0] // *.............................. + // sqrdmulh v5.8H, v23.8H, v3.8H // .*............................. + // ldr q7, [x0], #64 // ....*.......................... + // str q17, [x0, #-16] // ..........*.................... + // sqrdmulh v29.8H, v27.8H, v3.8H // ...*........................... + // sqrdmulh v19.8H, v7.8H, v3.8H // ........*...................... + // mul v25.8H, v23.8H, v2.8H // ..*............................ + // mul v0.8H, v7.8H, v2.8H // .........*..................... + // mul v26.8H, v27.8H, v2.8H // ......*........................ + // mls v25.8H, v5.8H, v4.H[0] // .......*....................... + // mls v26.8H, v29.8H, v4.H[0] // ...........*................... + // mls v0.8H, v19.8H, v4.H[0] // .............*................. + // str q25, [x0, #-48] // ............*.................. + // str q0, [x0, #-64] // .................*............. + // str q26, [x0, #-32] // ...............*............... 
+ + + ret + + .unreq src + .unreq count + .unreq wtmp + + .unreq data + .unreq q_data + .unreq res + .unreq q_res + + .unreq factor + .unreq factor_t + .unreq modulus + .unreq modulus_twisted + + .unreq tmp0 + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S new file mode 100644 index 000000000..e336b92cb --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// AArch64 re-implementation of the asymmetric base multiplication from: + +// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 +// https://eprint.iacr.org/2021/986 +// https://github.com/neon-ntt/neon-ntt + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 2 +/* simpasm: header-end */ + +// Input: +// - Vectors al, ah of 32-bit entries +// Output: +// - Montgomery reductions of al || ah, stored in x (al, ah and t0 are clobbered) +.macro montgomery_reduce_long x, a + uzp1 t0.8h, \a\()l.8h, \a\()h.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, t0.8h, modulus.8h + uzp2 \x\().8h, \a\()l.8h, \a\()h.8h +.endm + +// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+ +// Bounds: +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 +.macro pmull d, a, b + smull \d\()0l.4s, \a\()0.4h, \b\()0.4h + smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smull \d\()1l.4s, \a\()0.4h, \b\()1.4h + smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro pmlal d, a, b + smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h + smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h + smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + +.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + + out .req x0 + a0_ptr .req x1 + b0_ptr .req x2 + b0_cache_ptr .req x3 + a1_ptr .req x4 + b1_ptr .req x5 + b1_cache_ptr .req x6 + a2_ptr .req x7 + b2_ptr .req x8 + b2_cache_ptr .req x9 + a3_ptr .req x10 + b3_ptr .req x11 + b3_cache_ptr .req x12 + count .req x13 + wtmp .req w14 + + modulus .req v0 + 
modulus_twisted .req v2 + + aa0 .req v3 + aa1 .req v4 + bb0 .req v5 + bb1 .req v6 + bb1t .req v7 + + res0l .req v8 + res1l .req v9 + res0h .req v10 + res1h .req v11 + + tmp0 .req v12 + tmp1 .req v13 + q_tmp0 .req q12 + q_tmp1 .req q13 + + out0 .req v26 + out1 .req v27 + + t0 .req v28 + + .text + .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt): + push_stack + + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp + + // Computed bases of vector entries + + add a1_ptr, a0_ptr, #(1 * 512) + add b1_ptr, b0_ptr, #(1 * 512) + add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) + + mov count, #(MLKEM_N / 16) + // Instructions: 75 + // Expected cycles: 94 + // Expected IPC: 0.80 + + // Cycle bound: 94.0 + // IPC bound: 0.80 + + // Wall time: 1.49s + // User time: 1.49s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q9, [x4], #32 // *.......................................................................... + ldr q5, [x4, #-16] // ......*.................................................................... + ldr q11, [x5], #32 // .*......................................................................... + uzp1 v23.8H, v9.8H, v5.8H // .........*................................................................. + uzp2 v9.8H, v9.8H, v5.8H // .....................*..................................................... + ldr q5, [x2], #32 // ..*........................................................................ + ldr q7, [x5, #-16] // ..............*............................................................ + ldr q21, [x2, #-16] // ...*....................................................................... + uzp2 v10.8H, v11.8H, v7.8H // .................*......................................................... 
+ uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................ + uzp1 v7.8H, v5.8H, v21.8H // ....*...................................................................... + uzp2 v5.8H, v5.8H, v21.8H // .....*..................................................................... + ldr q21, [x1], #32 // .......*................................................................... + ldr q25, [x1, #-16] // ........*.................................................................. + ld1 {v6.8H}, [x3], #16 // ............................*.............................................. + uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................ + uzp2 v21.8H, v21.8H, v25.8H // ...........*............................................................... + smull v25.4S, v26.4H, v5.4H // ............*.............................................................. + smull2 v5.4S, v26.8H, v5.8H // .............*............................................................. + smull v19.4S, v26.4H, v7.4H // ..........................*................................................ + smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................ + smlal v25.4S, v21.4H, v7.4H // ...............*........................................................... + smlal2 v5.4S, v21.8H, v7.8H // ................*.......................................................... + smlal v19.4S, v21.4H, v6.4H // ...................................*....................................... + smlal2 v26.4S, v21.8H, v6.8H // .................................*......................................... + smlal v25.4S, v23.4H, v10.4H // ...................*....................................................... + smlal2 v5.4S, v23.8H, v10.8H // ....................*...................................................... 
+ smlal v19.4S, v23.4H, v11.4H // ......................................*.................................... + smlal2 v26.4S, v23.8H, v11.8H // ....................................*...................................... + ld1 {v23.8H}, [x6], #16 // ........................*.................................................. + smlal v25.4S, v9.4H, v11.4H // ......................*.................................................... + smlal2 v5.4S, v9.8H, v11.8H // .......................*................................................... + smlal2 v26.4S, v9.8H, v23.8H // .......................................*................................... + smlal v19.4S, v9.4H, v23.4H // .........................................*................................. + ldr q9, [x4], #32 // ...............................*........................................... + uzp1 v11.8H, v25.8H, v5.8H // .........................*................................................. + uzp1 v23.8H, v19.8H, v26.8H // .............................................*............................. + mul v11.8H, v11.8H, v2.8H // ...........................*............................................... + mul v23.8H, v23.8H, v2.8H // ..............................................*............................ + ldr q7, [x5], #32 // ................................*.......................................... + smlal2 v5.4S, v11.8H, v0.8H // .............................*............................................. + smlal v25.4S, v11.4H, v0.4H // ..................................*........................................ + ldr q11, [x2], #32 // .....................................*..................................... + ldr q21, [x2, #-16] // ........................................*.................................. + ldr q6, [x4, #-16] // ...............................................*........................... 
+ uzp1 v17.8H, v11.8H, v21.8H // ...........................................*............................... + ldr q10, [x1], #32 // ................................................*.......................... + ldr q29, [x1, #-16] // .................................................*......................... + uzp2 v11.8H, v11.8H, v21.8H // ............................................*.............................. + uzp1 v13.8H, v9.8H, v6.8H // ...................................................*....................... + uzp1 v3.8H, v10.8H, v29.8H // ....................................................*...................... + uzp2 v10.8H, v10.8H, v29.8H // .....................................................*..................... + smull v12.4S, v3.4H, v11.4H // ......................................................*.................... + smull2 v11.4S, v3.8H, v11.8H // .......................................................*................... + ldr q21, [x5, #-16] // ........................................................*.................. + smlal v12.4S, v10.4H, v17.4H // .........................................................*................. + smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................ + uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*............... + uzp1 v15.8H, v7.8H, v21.8H // ............................................................*.............. + smlal v12.4S, v13.4H, v29.4H // .............................................................*............. + smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............ + uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*........... + smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................ 
+ smlal v12.4S, v28.4H, v15.4H // .................................................................*......... + smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........ + smlal v19.4S, v23.4H, v0.4H // ................................................................*.......... + uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................ + smull v23.4S, v3.4H, v17.4H // ......................................................................*.... + uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*..... + uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*...... + mul v14.8H, v9.8H, v2.8H // .......................................................................*... + ld1 {v22.8H}, [x6], #16 // ...................................................................*....... + zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................* + ld1 {v4.8H}, [x3], #16 // .........................................................................*. + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q18, [x4], #32 // *.......................................................................... + // ldr q30, [x5], #32 // ..*........................................................................ + // ldr q8, [x2], #32 // .....*..................................................................... + // ldr q9, [x2, #-16] // .......*................................................................... + // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................ 
+ // uzp2 v4.8H, v8.8H, v9.8H // ...........*............................................................... + // ldr q19, [x4, #-16] // .*......................................................................... + // ldr q29, [x1], #32 // ............*.............................................................. + // ldr q12, [x1, #-16] // .............*............................................................. + // uzp1 v13.8H, v18.8H, v19.8H // ...*....................................................................... + // uzp1 v3.8H, v29.8H, v12.8H // ...............*........................................................... + // uzp2 v10.8H, v29.8H, v12.8H // ................*.......................................................... + // smull v12.4S, v3.4H, v4.4H // .................*......................................................... + // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................ + // ldr q5, [x5, #-16] // ......*.................................................................... + // smlal v12.4S, v10.4H, v17.4H // .....................*..................................................... + // smlal2 v11.4S, v10.8H, v17.8H // ......................*.................................................... + // uzp2 v14.8H, v30.8H, v5.8H // ........*.................................................................. + // uzp1 v15.8H, v30.8H, v5.8H // .........*................................................................. + // smlal v12.4S, v13.4H, v14.4H // .........................*................................................. + // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................ + // uzp2 v28.8H, v18.8H, v19.8H // ....*...................................................................... + // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................ 
+ // smlal2 v11.4S, v28.8H, v15.8H // ...............................*........................................... + // ld1 {v22.8H}, [x6], #16 // .............................*............................................. + // uzp1 v1.8H, v12.8H, v11.8H // ...................................*....................................... + // smull v23.4S, v3.4H, v17.4H // ...................*....................................................... + // mul v14.8H, v1.8H, v2.8H // .....................................*..................................... + // ld1 {v4.8H}, [x3], #16 // ..............*............................................................ + // smlal2 v11.4S, v14.8H, v0.8H // ........................................*.................................. + // smull2 v20.4S, v3.8H, v17.8H // ....................*...................................................... + // ldr q18, [x4], #32 // ..................................*........................................ + // ldr q30, [x5], #32 // .......................................*................................... + // smlal2 v20.4S, v10.8H, v4.8H // ........................*.................................................. + // smlal v12.4S, v14.4H, v0.4H // .........................................*................................. + // smlal v23.4S, v10.4H, v4.4H // .......................*................................................... + // smlal2 v20.4S, v13.8H, v15.8H // ............................*.............................................. + // ldr q8, [x2], #32 // ..........................................*................................ + // smlal v23.4S, v13.4H, v15.4H // ...........................*............................................... + // smlal2 v20.4S, v28.8H, v22.8H // ................................*.......................................... + // ldr q9, [x2, #-16] // ...........................................*............................... 
+ // smlal v23.4S, v28.4H, v22.4H // .................................*......................................... + // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........ + // uzp1 v17.8H, v8.8H, v9.8H // .............................................*............................. + // uzp2 v4.8H, v8.8H, v9.8H // ................................................*.......................... + // uzp1 v5.8H, v23.8H, v20.8H // ....................................*...................................... + // mul v31.8H, v5.8H, v2.8H // ......................................*.................................... + // ldr q19, [x4, #-16] // ............................................*.............................. + // ldr q29, [x1], #32 // ..............................................*............................ + // ldr q12, [x1, #-16] // ...............................................*........................... + // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............ + // uzp1 v13.8H, v18.8H, v19.8H // .................................................*......................... + // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................ + // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*....................... + // smull v12.4S, v3.4H, v4.4H // ....................................................*...................... + // smull2 v11.4S, v3.8H, v4.8H // .....................................................*..................... + // ldr q5, [x5, #-16] // ......................................................*.................... + // smlal v12.4S, v10.4H, v17.4H // .......................................................*................... + // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*.................. 
+ // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*................. + // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................ + // smlal v12.4S, v13.4H, v14.4H // ...........................................................*............... + // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*.............. + // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*............. + // smlal v23.4S, v31.4H, v0.4H // .................................................................*......... + // smlal v12.4S, v28.4H, v15.4H // ...............................................................*........... + // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*.......... + // ld1 {v22.8H}, [x6], #16 // .......................................................................*... + // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*..... + // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*...... + // smull v23.4S, v3.4H, v17.4H // ...................................................................*....... + // mul v14.8H, v1.8H, v2.8H // ......................................................................*.... + // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + // ld1 {v4.8H}, [x3], #16 // ..........................................................................* + // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*. 
+ + sub count, count, #2 +polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop: + // Instructions: 48 + // Expected cycles: 58 + // Expected IPC: 0.83 + + // Cycle bound: 58.0 + // IPC bound: 0.83 + + // Wall time: 6.39s + // User time: 6.39s + + // -------------- original position --------------> + // 0 25 + // |------------------------|---------------------- + smull2 v20.4S, v3.8H, v17.8H // ..........*..................................... + ldr q18, [x4], #32 // .................e.............................. + ldr q30, [x5], #32 // .....................e.......................... + smlal2 v20.4S, v10.8H, v4.8H // ............*................................... + smlal v12.4S, v14.4H, v0.4H // .........................................*...... + smlal v23.4S, v10.4H, v4.4H // ...........*.................................... + str q9, [x0, #16] // ...............................................l + smlal2 v20.4S, v13.8H, v15.8H // ...........................*.................... + ldr q8, [x2], #32 // ....e........................................... + smlal v23.4S, v13.4H, v15.4H // ..........................*..................... + smlal2 v20.4S, v28.8H, v22.8H // .............................*.................. + zip1 v26.8H, v19.8H, v27.8H // ............................................l... + ldr q9, [x2, #-16] // .....e.......................................... + smlal v23.4S, v28.4H, v22.4H // ............................*................... + uzp2 v27.8H, v12.8H, v11.8H // ...........................................*.... + uzp1 v17.8H, v8.8H, v9.8H // ......e......................................... + uzp2 v4.8H, v8.8H, v9.8H // .......e........................................ + uzp1 v5.8H, v23.8H, v20.8H // ..................................*............. + str q26, [x0], #32 // ..............................................l. + mul v31.8H, v5.8H, v2.8H // ...................................*............ 
+ ldr q19, [x4, #-16] // ..................e............................. + ldr q29, [x1], #32 // e............................................... + ldr q12, [x1, #-16] // .e.............................................. + smlal2 v20.4S, v31.8H, v0.8H // .....................................*.......... + uzp1 v13.8H, v18.8H, v19.8H // ...................e............................ + uzp1 v3.8H, v29.8H, v12.8H // ..e............................................. + uzp2 v10.8H, v29.8H, v12.8H // ...e............................................ + smull v12.4S, v3.4H, v4.4H // .............e.................................. + smull2 v11.4S, v3.8H, v4.8H // ..............e................................. + ldr q5, [x5, #-16] // ......................e......................... + smlal v12.4S, v10.4H, v17.4H // ...............e................................ + smlal2 v11.4S, v10.8H, v17.8H // ................e............................... + uzp2 v14.8H, v30.8H, v5.8H // ........................e....................... + uzp1 v15.8H, v30.8H, v5.8H // .......................e........................ + smlal v12.4S, v13.4H, v14.4H // ..............................e................. + smlal2 v11.4S, v13.8H, v14.8H // ...............................e................ + uzp2 v28.8H, v18.8H, v19.8H // ....................e........................... + smlal v23.4S, v31.4H, v0.4H // ....................................*........... + smlal v12.4S, v28.4H, v15.4H // ................................e............... + smlal2 v11.4S, v28.8H, v15.8H // .................................e.............. + ld1 {v22.8H}, [x6], #16 // .........................e...................... + uzp2 v19.8H, v23.8H, v20.8H // ......................................*......... + uzp1 v1.8H, v12.8H, v11.8H // .......................................e........ + smull v23.4S, v3.4H, v17.4H // .........e...................................... 
+ mul v14.8H, v1.8H, v2.8H // ........................................e....... + zip2 v9.8H, v19.8H, v27.8H // .............................................*.. + ld1 {v4.8H}, [x3], #16 // ........e....................................... + smlal2 v11.4S, v14.8H, v0.8H // ..........................................e..... + + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'.................. + // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'.................. + // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~.......... + // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~...... + // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~... + // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~.. + // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'.................. 
+ // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'.................. + // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~.................. + // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~............. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~............... + // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'.................. + // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'.................. + // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'.................. + // ldr q12, [x4], #32 // e..............................................'~..............................................'~................. + // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'.................. + // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................ 
+ // ldr q13, [x5, #-16] // ............................e..................'............................~..................'.................. + // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'.................. + // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'.................. + // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'.................. + // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~......... + // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~........... + // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~..... + // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........ + // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'.................. + // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'.................. + // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'.................. + // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~. 
+ // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'.................. + // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'.................. + // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'.................. + // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'.................. + // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'.................. + // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'.................. + // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~.............. + // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'.................. + // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~.... + // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l....... + // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'.................. + // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l + // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............ 
+ + sub count, count, #1 + cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop + // Instructions: 21 + // Expected cycles: 35 + // Expected IPC: 0.60 + + // Cycle bound: 35.0 + // IPC bound: 0.60 + + // Wall time: 0.08s + // User time: 0.08s + + // ----- original position -----> + // 0 25 + // |------------------------|---- + smull2 v5.4S, v3.8H, v17.8H // *............................. + smlal v12.4S, v14.4H, v0.4H // ..*........................... + smlal v23.4S, v10.4H, v4.4H // ...*.......................... + str q9, [x0, #16] // ....*......................... + smlal2 v5.4S, v10.8H, v4.8H // .*............................ + uzp2 v11.8H, v12.8H, v11.8H // ..........*................... + zip1 v9.8H, v19.8H, v27.8H // ........*..................... + smlal v23.4S, v13.4H, v15.4H // ......*....................... + smlal2 v5.4S, v13.8H, v15.8H // .....*........................ + str q9, [x0], #32 // ............*................. + smlal v23.4S, v28.4H, v22.4H // .........*.................... + smlal2 v5.4S, v28.8H, v22.8H // .......*...................... + uzp1 v9.8H, v23.8H, v5.8H // ...........*.................. + mul v9.8H, v9.8H, v2.8H // .............*................ + smlal2 v5.4S, v9.8H, v0.8H // ..............*............... + smlal v23.4S, v9.4H, v0.4H // ...............*.............. + uzp2 v9.8H, v23.8H, v5.8H // ................*............. + zip2 v5.8H, v9.8H, v11.8H // .................*............ + zip1 v9.8H, v9.8H, v11.8H // ...................*.......... + str q5, [x0, #16] // ..................*........... + str q9, [x0], #32 // ....................*......... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // smull2 v20.4S, v3.8H, v17.8H // *.............................. + // smlal2 v20.4S, v10.8H, v4.8H // ....*.......................... + // smlal v12.4S, v14.4H, v0.4H // .*............................. + // smlal v23.4S, v10.4H, v4.4H // ..*............................ 
+ // str q9, [x0, #16] // ...*........................... + // smlal2 v20.4S, v13.8H, v15.8H // ........*...................... + // smlal v23.4S, v13.4H, v15.4H // .......*....................... + // smlal2 v20.4S, v28.8H, v22.8H // ...........*................... + // zip1 v26.8H, v19.8H, v27.8H // ......*........................ + // smlal v23.4S, v28.4H, v22.4H // ..........*.................... + // uzp2 v27.8H, v12.8H, v11.8H // .....*......................... + // uzp1 v5.8H, v23.8H, v20.8H // ............*.................. + // str q26, [x0], #32 // .........*..................... + // mul v31.8H, v5.8H, v2.8H // .............*................. + // smlal2 v20.4S, v31.8H, v0.8H // ..............*................ + // smlal v23.4S, v31.4H, v0.4H // ...............*............... + // uzp2 v19.8H, v23.8H, v20.8H // ................*.............. + // zip2 v9.8H, v19.8H, v27.8H // .................*............. + // str q9, [x0, #16] // ...................*........... + // zip1 v26.8H, v19.8H, v27.8H // ..................*............ + // str q26, [x0], #32 // ....................*.......... 
+ + + pop_stack + ret + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq out + .unreq a0_ptr + .unreq b0_ptr + .unreq b0_cache_ptr + .unreq a1_ptr + .unreq b1_ptr + .unreq b1_cache_ptr + .unreq a2_ptr + .unreq b2_ptr + .unreq b2_cache_ptr + .unreq a3_ptr + .unreq b3_ptr + .unreq b3_cache_ptr + .unreq count + .unreq modulus + .unreq modulus_twisted + .unreq wtmp + .unreq aa0 + .unreq aa1 + .unreq bb0 + .unreq bb1 + .unreq bb1t + .unreq res0l + .unreq res1l + .unreq res0h + .unreq res1h + .unreq tmp0 + .unreq tmp1 + .unreq q_tmp0 + .unreq q_tmp1 + .unreq out0 + .unreq out1 + .unreq t0 + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S new file mode 100644 index 000000000..1c30ed6aa --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S @@ -0,0 +1,650 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// AArch64 re-implementation of the asymmetric base multiplication from: + +// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 +// https://eprint.iacr.org/2021/986 +// https://github.com/neon-ntt/neon-ntt + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 3 +/* simpasm: header-end */ + +// Input: +// - Vectors al, ah of 32-bit entries +// Output: +// - Montgomery reductions of al || ah, stored in al +.macro montgomery_reduce_long x, a + uzp1 t0.8h, \a\()l.8h, \a\()h.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, t0.8h, modulus.8h + uzp2 \x\().8h, \a\()l.8h, \a\()h.8h +.endm + +// Computes 
products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit. + +// Bounds: +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 +.macro pmull d, a, b + smull \d\()0l.4s, \a\()0.4h, \b\()0.4h + smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smull \d\()1l.4s, \a\()0.4h, \b\()1.4h + smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro pmlal d, a, b + smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h + smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h + smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + +.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + + out .req x0 + a0_ptr .req x1 + b0_ptr .req x2 + b0_cache_ptr .req x3 + a1_ptr .req x4 + b1_ptr .req x5 + b1_cache_ptr .req x6 + a2_ptr .req x7 + b2_ptr .req x8 + b2_cache_ptr .req x9 + a3_ptr .req x10 + b3_ptr .req x11 + b3_cache_ptr .req x12 + count
.req x13 + wtmp .req w14 + + modulus .req v0 + modulus_twisted .req v2 + + aa0 .req v3 + aa1 .req v4 + bb0 .req v5 + bb1 .req v6 + bb1t .req v7 + + res0l .req v8 + res1l .req v9 + res0h .req v10 + res1h .req v11 + + tmp0 .req v12 + tmp1 .req v13 + q_tmp0 .req q12 + q_tmp1 .req q13 + + out0 .req v26 + out1 .req v27 + + t0 .req v28 + + .text + .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt): + push_stack + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp + + // Computed bases of vector entries + + add a1_ptr, a0_ptr, #(1 * 512) + add b1_ptr, b0_ptr, #(1 * 512) + add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) + add a2_ptr, a0_ptr, #(2 * 512) + add b2_ptr, b0_ptr, #(2 * 512) + add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) + + mov count, #(MLKEM_N / 16) + // Instructions: 75 + // Expected cycles: 103 + // Expected IPC: 0.73 + + // Cycle bound: 103.0 + // IPC bound: 0.73 + + // Wall time: 0.94s + // User time: 0.94s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q7, [x2, #16] // *.......................................................................... + ldr q20, [x2], #32 // ..*........................................................................ + ldr q15, [x1, #16] // .*......................................................................... + uzp1 v8.8H, v20.8H, v7.8H // ...............*........................................................... + uzp2 v7.8H, v20.8H, v7.8H // ................*.......................................................... + ld1 {v20.8H}, [x3], #16 // ...*....................................................................... + ldr q30, [x1], #32 // ..............*............................................................ 
+ ldr q11, [x4], #32 // ....*...................................................................... + uzp1 v16.8H, v30.8H, v15.8H // .................*......................................................... + uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................ + smull v30.4S, v16.4H, v7.4H // ...................*....................................................... + smull2 v7.4S, v16.8H, v7.8H // ....................*...................................................... + smull v9.4S, v16.4H, v8.4H // .....................*..................................................... + smull2 v16.4S, v16.8H, v8.8H // ......................*.................................................... + smlal v30.4S, v15.4H, v8.4H // .......................*................................................... + smlal2 v7.4S, v15.8H, v8.8H // ........................*.................................................. + smlal v9.4S, v15.4H, v20.4H // .........................*................................................. + smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................ + ldr q20, [x4, #-16] // .....*..................................................................... + ldr q15, [x5], #32 // ......*.................................................................... + uzp1 v8.8H, v11.8H, v20.8H // ...........................*............................................... + uzp2 v20.8H, v11.8H, v20.8H // ............................*.............................................. + ldr q11, [x5, #-16] // .......*................................................................... + ld1 {v27.8H}, [x6], #16 // ........*.................................................................. + uzp1 v10.8H, v15.8H, v11.8H // .............................*............................................. 
+ uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................ + smlal v9.4S, v8.4H, v10.4H // ...............................*........................................... + smlal2 v16.4S, v8.8H, v10.8H // ................................*.......................................... + smlal v30.4S, v8.4H, v15.4H // .................................*......................................... + smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................ + smlal v9.4S, v20.4H, v27.4H // ...................................*....................................... + smlal2 v16.4S, v20.8H, v27.8H // ....................................*...................................... + smlal v30.4S, v20.4H, v10.4H // .....................................*..................................... + smlal2 v7.4S, v20.8H, v10.8H // ......................................*.................................... + ldr q20, [x7], #32 // .........*................................................................. + ldr q15, [x7, #-16] // ..........*................................................................ + ldr q8, [x8], #32 // ...........*............................................................... + uzp1 v11.8H, v20.8H, v15.8H // .......................................*................................... + uzp2 v20.8H, v20.8H, v15.8H // ........................................*.................................. + ldr q15, [x8, #-16] // ............*.............................................................. + ld1 {v27.8H}, [x9], #16 // .............*............................................................. + uzp1 v10.8H, v8.8H, v15.8H // .........................................*................................. + uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................ 
+ smlal v9.4S, v11.4H, v10.4H // ...........................................*............................... + smlal2 v16.4S, v11.8H, v10.8H // ............................................*.............................. + smlal v30.4S, v11.4H, v15.4H // .............................................*............................. + smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................ + smlal v9.4S, v20.4H, v27.4H // ...............................................*........................... + smlal2 v16.4S, v20.8H, v27.8H // ................................................*.......................... + smlal v30.4S, v20.4H, v10.4H // .................................................*......................... + smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................ + ldr q15, [x2], #32 // ...............................................................*........... + uzp1 v20.8H, v9.8H, v16.8H // ....................................................*...................... + uzp1 v8.8H, v30.8H, v7.8H // .....................................................*..................... + mul v20.8H, v20.8H, v2.8H // ......................................................*.................... + mul v8.8H, v8.8H, v2.8H // .......................................................*................... + ldr q21, [x4], #32 // .................................................................*......... + smlal v9.4S, v20.4H, v0.4H // ........................................................*.................. + smlal2 v16.4S, v20.8H, v0.8H // .........................................................*................. + smlal v30.4S, v8.4H, v0.4H // ..........................................................*................ + smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*............... 
+ ldr q6, [x4, #-16] // ..................................................................*........ + uzp2 v27.8H, v9.8H, v16.8H // ............................................................*.............. + uzp2 v10.8H, v30.8H, v7.8H // .............................................................*............. + ldr q16, [x2, #-16] // ...................................................*....................... + ldr q30, [x1, #16] // ..............................................................*............ + ld1 {v9.8H}, [x3], #16 // ................................................................*.......... + ldr q1, [x5], #32 // ...................................................................*....... + ldr q12, [x5, #-16] // ....................................................................*...... + ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + ldr q19, [x7], #32 // ......................................................................*.... + ldr q31, [x7, #-16] // .......................................................................*... + ldr q17, [x8], #32 // ........................................................................*.. + ldr q18, [x8, #-16] // .........................................................................*. + ld1 {v25.8H}, [x9], #16 // ..........................................................................* + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q16, [x2, #16] // *.......................................................................... + // ldr q30, [x1, #16] // ..*........................................................................ + // ldr q15, [x2], #32 // .*......................................................................... 
+ // ld1 {v9.8H}, [x3], #16 // .....*..................................................................... + // ldr q21, [x4], #32 // .......*................................................................... + // ldr q6, [x4, #-16] // ..................*........................................................ + // ldr q1, [x5], #32 // ...................*....................................................... + // ldr q12, [x5, #-16] // ......................*.................................................... + // ld1 {v24.8H}, [x6], #16 // .......................*................................................... + // ldr q19, [x7], #32 // ..................................*........................................ + // ldr q31, [x7, #-16] // ...................................*....................................... + // ldr q17, [x8], #32 // ....................................*...................................... + // ldr q18, [x8, #-16] // .......................................*................................... + // ld1 {v25.8H}, [x9], #16 // ........................................*.................................. + // ldr q20, [x1], #32 // ......*.................................................................... + // uzp1 v7.8H, v15.8H, v16.8H // ...*....................................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ....*...................................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ........*.................................................................. + // uzp2 v20.8H, v20.8H, v30.8H // .........*................................................................. + // smull v30.4S, v8.4H, v15.4H // ..........*................................................................ + // smull2 v15.4S, v8.8H, v15.8H // ...........*............................................................... 
+ // smull v11.4S, v8.4H, v7.4H // ............*.............................................................. + // smull2 v8.4S, v8.8H, v7.8H // .............*............................................................. + // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................ + // smlal2 v15.4S, v20.8H, v7.8H // ...............*........................................................... + // smlal v11.4S, v20.4H, v9.4H // ................*.......................................................... + // smlal2 v8.4S, v20.8H, v9.8H // .................*......................................................... + // uzp1 v7.8H, v21.8H, v6.8H // ....................*...................................................... + // uzp2 v20.8H, v21.8H, v6.8H // .....................*..................................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........................*.................................................. + // uzp2 v9.8H, v1.8H, v12.8H // .........................*................................................. + // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................ + // smlal2 v8.4S, v7.8H, v16.8H // ...........................*............................................... + // smlal v30.4S, v7.4H, v9.4H // ............................*.............................................. + // smlal2 v15.4S, v7.8H, v9.8H // .............................*............................................. + // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................ + // smlal2 v8.4S, v20.8H, v24.8H // ...............................*........................................... + // smlal v30.4S, v20.4H, v16.4H // ................................*.......................................... 
+ // smlal2 v15.4S, v20.8H, v16.8H // .................................*......................................... + // uzp1 v7.8H, v19.8H, v31.8H // .....................................*..................................... + // uzp2 v20.8H, v19.8H, v31.8H // ......................................*.................................... + // uzp1 v16.8H, v17.8H, v18.8H // .........................................*................................. + // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................ + // smlal v11.4S, v7.4H, v16.4H // ...........................................*............................... + // smlal2 v8.4S, v7.8H, v16.8H // ............................................*.............................. + // smlal v30.4S, v7.4H, v9.4H // .............................................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................ + // smlal v11.4S, v20.4H, v25.4H // ...............................................*........................... + // smlal2 v8.4S, v20.8H, v25.8H // ................................................*.......................... + // smlal v30.4S, v20.4H, v16.4H // .................................................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................ + // ldr q16, [x2, #16] // ................................................................*.......... + // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*...................... + // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*..................... + // mul v7.8H, v7.8H, v2.8H // ......................................................*.................... 
+ // mul v20.8H, v20.8H, v2.8H // .......................................................*................... + // smlal v11.4S, v7.4H, v0.4H // .........................................................*................. + // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................ + // smlal v30.4S, v20.4H, v0.4H // ...........................................................*............... + // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*.............. + // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............ + // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*........... + // ldr q30, [x1, #16] // .................................................................*......... + // ldr q15, [x2], #32 // ...................................................*....................... + // ld1 {v9.8H}, [x3], #16 // ..................................................................*........ + // ldr q21, [x4], #32 // ........................................................*.................. + // ldr q6, [x4, #-16] // .............................................................*............. + // ldr q1, [x5], #32 // ...................................................................*....... + // ldr q12, [x5, #-16] // ....................................................................*...... + // ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + // ldr q19, [x7], #32 // ......................................................................*.... + // ldr q31, [x7, #-16] // .......................................................................*... + // ldr q17, [x8], #32 // ........................................................................*.. 
+ // ldr q18, [x8, #-16] // .........................................................................*. + // ld1 {v25.8H}, [x9], #16 // ..........................................................................* + + sub count, count, #2 +polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop: + // Instructions: 65 + // Expected cycles: 80 + // Expected IPC: 0.81 + + // Cycle bound: 80.0 + // IPC bound: 0.81 + + // Wall time: 11.64s + // User time: 11.64s + + // ---------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- + ldr q20, [x1], #32 // *................................................................ + uzp1 v7.8H, v15.8H, v16.8H // ......*.......................................................... + uzp2 v15.8H, v15.8H, v16.8H // .......*......................................................... + uzp1 v8.8H, v20.8H, v30.8H // ..*.............................................................. + uzp2 v20.8H, v20.8H, v30.8H // ...*............................................................. + smull v30.4S, v8.4H, v15.4H // .............*................................................... + smull2 v15.4S, v8.8H, v15.8H // ..............*.................................................. + smull v11.4S, v8.4H, v7.4H // .........*....................................................... + smull2 v8.4S, v8.8H, v7.8H // ..........*...................................................... + smlal v30.4S, v20.4H, v7.4H // ...............*................................................. + smlal2 v15.4S, v20.8H, v7.8H // ................*................................................ + smlal v11.4S, v20.4H, v9.4H // ...........*..................................................... + smlal2 v8.4S, v20.8H, v9.8H // ............*.................................................... + uzp1 v7.8H, v21.8H, v6.8H // ...................*............................................. 
+ uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................ + uzp1 v16.8H, v1.8H, v12.8H // .......................*......................................... + uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................ + smlal v11.4S, v7.4H, v16.4H // ..........................*...................................... + smlal2 v8.4S, v7.8H, v16.8H // ...........................*..................................... + smlal v30.4S, v7.4H, v9.4H // ..............................*.................................. + smlal2 v15.4S, v7.8H, v9.8H // ...............................*................................. + smlal v11.4S, v20.4H, v24.4H // ............................*.................................... + smlal2 v8.4S, v20.8H, v24.8H // .............................*................................... + smlal v30.4S, v20.4H, v16.4H // ................................*................................ + smlal2 v15.4S, v20.8H, v16.8H // .................................*............................... + uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................ + uzp2 v20.8H, v19.8H, v31.8H // .....................................*........................... + uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................ + uzp2 v9.8H, v17.8H, v18.8H // .........................................*....................... + smlal v11.4S, v7.4H, v16.4H // ...........................................*..................... + smlal2 v8.4S, v7.8H, v16.8H // ............................................*.................... + smlal v30.4S, v7.4H, v9.4H // ...............................................*................. + smlal2 v15.4S, v7.8H, v9.8H // ................................................*................ + smlal v11.4S, v20.4H, v25.4H // .............................................*................... 
+ smlal2 v8.4S, v20.8H, v25.8H // ..............................................*.................. + smlal v30.4S, v20.4H, v16.4H // .................................................*............... + smlal2 v15.4S, v20.8H, v16.8H // ..................................................*.............. + ldr q16, [x2, #16] // .....e........................................................... + uzp1 v7.8H, v11.8H, v8.8H // ...................................................*............. + uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........ + mul v7.8H, v7.8H, v2.8H // ....................................................*............ + mul v20.8H, v20.8H, v2.8H // .........................................................*....... + zip2 v9.8H, v27.8H, v10.8H // ..............................................................l.. + zip1 v27.8H, v27.8H, v10.8H // .............................................................l... + smlal v11.4S, v7.4H, v0.4H // .....................................................*........... + smlal2 v8.4S, v7.8H, v0.8H // ......................................................*.......... + smlal v30.4S, v20.4H, v0.4H // ..........................................................*...... + smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*..... + str q27, [x0], #32 // ...............................................................l. + uzp2 v27.8H, v11.8H, v8.8H // .......................................................*......... + str q9, [x0, #-16] // ................................................................l + uzp2 v10.8H, v30.8H, v15.8H // ............................................................*.... + ldr q30, [x1, #16] // .e............................................................... + ldr q15, [x2], #32 // ....e............................................................ 
+ ld1 {v9.8H}, [x3], #16 // ........e........................................................ + ldr q21, [x4], #32 // .................e............................................... + ldr q6, [x4, #-16] // ..................e.............................................. + ldr q1, [x5], #32 // .....................e........................................... + ldr q12, [x5, #-16] // ......................e.......................................... + ld1 {v24.8H}, [x6], #16 // .........................e....................................... + ldr q19, [x7], #32 // ..................................e.............................. + ldr q31, [x7, #-16] // ...................................e............................. + ldr q17, [x8], #32 // ......................................e.......................... + ldr q18, [x8, #-16] // .......................................e......................... + ld1 {v25.8H}, [x9], #16 // ..........................................e...................... + + // ---------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q12, [x1], #32 // ............................*................................................................~.................................................. + // ldr q13, [x1, #-16] // ...............e............'...................................................~............'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~............................................... 
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~.............................................. + // ldr q12, [x2], #32 // ................e...........'....................................................~...........'.................................................. + // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~............. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~................................................. + // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................ + // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'.................................................. + // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~........................................... + // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~.......................................... + // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~....................................... + // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~...................................... + // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~............................................. 
+ // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................ + // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~......................................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................ + // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'.................................................. + // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~..................................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~.................................... + // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'.................................................. + // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'.................................................. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~................................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~.................................. 
+ // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................ + // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~............................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................ + // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~............................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~.............................. + // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~........................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~.......................... + // ldr q12, [x7], #32 // .......................e....'...........................................................~....'.................................................. + // ldr q13, [x7, #-16] // ........................e...'............................................................~...'.................................................. 
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~......................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................ + // ldr q12, [x8], #32 // .........................e..'.............................................................~..'.................................................. + // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'.................................................. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~....................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~...................... + // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~..................... + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~.................... + // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................ 
+ // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~.................. + // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~............... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~.............. + // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............ + // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~.......... + // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~...... + // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~..... + // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~. + // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~........... + // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~......... 
+ // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~.... + // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~... + // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'.................................................. + // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l....... + // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........ + // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l.. + // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l + + sub count, count, #1 + cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop + // Instructions: 55 + // Expected cycles: 61 + // Expected IPC: 0.90 + + // Cycle bound: 61.0 + // IPC bound: 0.90 + + // Wall time: 8.41s + // User time: 8.41s + + // ----------------- original position ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + ldr q7, [x1], #32 // *...................................................... + uzp1 v20.8H, v15.8H, v16.8H // .*..................................................... + uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + uzp1 v23.8H, v7.8H, v30.8H // ...*................................................... 
+ uzp2 v11.8H, v7.8H, v30.8H // ....*.................................................. + smull2 v8.4S, v23.8H, v20.8H // ........*.............................................. + smull v5.4S, v23.4H, v20.4H // .......*............................................... + smull2 v30.4S, v23.8H, v15.8H // ......*................................................ + uzp1 v28.8H, v1.8H, v12.8H // ...............*....................................... + smlal2 v8.4S, v11.8H, v9.8H // ............*.......................................... + smlal v5.4S, v11.4H, v9.4H // ...........*........................................... + uzp1 v3.8H, v21.8H, v6.8H // .............*......................................... + smull v16.4S, v23.4H, v15.4H // .....*................................................. + smlal2 v8.4S, v3.8H, v28.8H // ..................*.................................... + smlal v5.4S, v3.4H, v28.4H // .................*..................................... + uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................ + uzp1 v7.8H, v17.8H, v18.8H // ...........................*........................... + smlal2 v8.4S, v29.8H, v24.8H // ......................*................................ + uzp1 v14.8H, v19.8H, v31.8H // .........................*............................. + smlal v16.4S, v11.4H, v20.4H // .........*............................................. + smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................ + smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................ + uzp2 v20.8H, v1.8H, v12.8H // ................*...................................... + uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................ + smlal2 v30.4S, v3.8H, v20.8H // ....................*.................................. + smlal v16.4S, v3.4H, v20.4H // ...................*................................... 
+ smlal v5.4S, v29.4H, v24.4H // .....................*................................. + uzp2 v9.8H, v17.8H, v18.8H // ............................*.......................... + smlal2 v30.4S, v29.8H, v28.8H // ........................*.............................. + smlal v16.4S, v29.4H, v28.4H // .......................*............................... + smlal v5.4S, v14.4H, v7.4H // .............................*......................... + smlal2 v8.4S, v21.8H, v25.8H // ..................................*.................... + smlal2 v30.4S, v14.8H, v9.8H // ................................*...................... + smlal v16.4S, v14.4H, v9.4H // ...............................*....................... + smlal v5.4S, v21.4H, v25.4H // .................................*..................... + zip1 v20.8H, v27.8H, v10.8H // ..........................................*............ + smlal2 v30.4S, v21.8H, v7.8H // ....................................*.................. + smlal v16.4S, v21.4H, v7.4H // ...................................*................... + uzp1 v7.8H, v5.8H, v8.8H // .....................................*................. + str q20, [x0], #32 // ...............................................*....... + mul v15.8H, v7.8H, v2.8H // .......................................*............... + uzp1 v7.8H, v16.8H, v30.8H // ......................................*................ + zip2 v31.8H, v27.8H, v10.8H // .........................................*............. + mul v20.8H, v7.8H, v2.8H // ........................................*.............. + smlal v5.4S, v15.4H, v0.4H // ...........................................*........... + smlal2 v8.4S, v15.8H, v0.8H // ............................................*.......... + str q31, [x0, #-16] // .................................................*..... + smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........ 
+ smlal v16.4S, v20.4H, v0.4H // .............................................*......... + uzp2 v15.8H, v5.8H, v8.8H // ................................................*...... + uzp2 v20.8H, v16.8H, v30.8H // ..................................................*.... + zip1 v7.8H, v15.8H, v20.8H // ....................................................*.. + zip2 v20.8H, v15.8H, v20.8H // ...................................................*... + str q7, [x0], #32 // .....................................................*. + str q20, [x0, #-16] // ......................................................* + + // -------------------- new position --------------------> + // 0 25 50 + // |------------------------|------------------------|---- + // ldr q20, [x1], #32 // *...................................................... + // uzp1 v7.8H, v15.8H, v16.8H // .*..................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ...*................................................... + // uzp2 v20.8H, v20.8H, v30.8H // ....*.................................................. + // smull v30.4S, v8.4H, v15.4H // ............*.......................................... + // smull2 v15.4S, v8.8H, v15.8H // .......*............................................... + // smull v11.4S, v8.4H, v7.4H // ......*................................................ + // smull2 v8.4S, v8.8H, v7.8H // .....*................................................. + // smlal v30.4S, v20.4H, v7.4H // ...................*................................... + // smlal2 v15.4S, v20.8H, v7.8H // ....................*.................................. + // smlal v11.4S, v20.4H, v9.4H // ..........*............................................ + // smlal2 v8.4S, v20.8H, v9.8H // .........*............................................. 
+ // uzp1 v7.8H, v21.8H, v6.8H // ...........*........................................... + // uzp2 v20.8H, v21.8H, v6.8H // ...............*....................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........*.............................................. + // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................ + // smlal v11.4S, v7.4H, v16.4H // ..............*........................................ + // smlal2 v8.4S, v7.8H, v16.8H // .............*......................................... + // smlal v30.4S, v7.4H, v9.4H // .........................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ........................*.............................. + // smlal v11.4S, v20.4H, v24.4H // ..........................*............................ + // smlal2 v8.4S, v20.8H, v24.8H // .................*..................................... + // smlal v30.4S, v20.4H, v16.4H // .............................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ............................*.......................... + // uzp1 v7.8H, v19.8H, v31.8H // ..................*.................................... + // uzp2 v20.8H, v19.8H, v31.8H // .......................*............................... + // uzp1 v16.8H, v17.8H, v18.8H // ................*...................................... + // uzp2 v9.8H, v17.8H, v18.8H // ...........................*........................... + // smlal v11.4S, v7.4H, v16.4H // ..............................*........................ + // smlal2 v8.4S, v7.8H, v16.8H // .....................*................................. + // smlal v30.4S, v7.4H, v9.4H // .................................*..................... + // smlal2 v15.4S, v7.8H, v9.8H // ................................*...................... + // smlal v11.4S, v20.4H, v25.4H // ..................................*.................... 
+ // smlal2 v8.4S, v20.8H, v25.8H // ...............................*....................... + // smlal v30.4S, v20.4H, v16.4H // .....................................*................. + // smlal2 v15.4S, v20.8H, v16.8H // ....................................*.................. + // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................ + // uzp1 v20.8H, v30.8H, v15.8H // .........................................*............. + // mul v7.8H, v7.8H, v2.8H // ........................................*.............. + // mul v20.8H, v20.8H, v2.8H // ...........................................*........... + // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............ + // zip1 v27.8H, v27.8H, v10.8H // ...................................*................... + // smlal v11.4S, v7.4H, v0.4H // ............................................*.......... + // smlal2 v8.4S, v7.8H, v0.8H // .............................................*......... + // smlal v30.4S, v20.4H, v0.4H // ................................................*...... + // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*....... + // str q27, [x0], #32 // .......................................*............... + // uzp2 v27.8H, v11.8H, v8.8H // .................................................*..... + // str q9, [x0, #-16] // ..............................................*........ + // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*.... + // zip2 v9.8H, v27.8H, v10.8H // ....................................................*.. + // zip1 v27.8H, v27.8H, v10.8H // ...................................................*... + // str q27, [x0], #32 // .....................................................*. 
+ // str q9, [x0, #-16] // ......................................................* + + + pop_stack + ret + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq out + .unreq a0_ptr + .unreq b0_ptr + .unreq b0_cache_ptr + .unreq a1_ptr + .unreq b1_ptr + .unreq b1_cache_ptr + .unreq a2_ptr + .unreq b2_ptr + .unreq b2_cache_ptr + .unreq a3_ptr + .unreq b3_ptr + .unreq b3_cache_ptr + .unreq count + .unreq modulus + .unreq modulus_twisted + .unreq wtmp + .unreq aa0 + .unreq aa1 + .unreq bb0 + .unreq bb1 + .unreq bb1t + .unreq res0l + .unreq res1l + .unreq res0h + .unreq res1h + .unreq tmp0 + .unreq tmp1 + .unreq q_tmp0 + .unreq q_tmp1 + .unreq out0 + .unreq out1 + .unreq t0 + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S new file mode 100644 index 000000000..c3d70ed42 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// AArch64 re-implementation of the asymmetric base multiplication from: + +// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 +// https://eprint.iacr.org/2021/986 +// https://github.com/neon-ntt/neon-ntt + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 4 +/* simpasm: header-end */ + +// Input: +// - Vectors al, ah of 32-bit entries +// Output: +// - Montgomery reductions of al || ah, stored in x; al and ah are clobbered +.macro montgomery_reduce_long x, a + uzp1 t0.8h, \a\()l.8h, \a\()h.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2
\a\()h.4s, t0.8h, modulus.8h + uzp2 \x\().8h, \a\()l.8h, \a\()h.8h +.endm + +// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit. + +// Bounds: +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 +.macro pmull d, a, b + smull \d\()0l.4s, \a\()0.4h, \b\()0.4h + smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smull \d\()1l.4s, \a\()0.4h, \b\()1.4h + smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro pmlal d, a, b + smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h + smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h + smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + +.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + + out .req x0 + a0_ptr .req x1 + b0_ptr .req x2 + b0_cache_ptr .req x3 + a1_ptr .req x4 + b1_ptr .req x5 + b1_cache_ptr .req x6 + a2_ptr .req x7 + b2_ptr .req x8 +
b2_cache_ptr .req x9 + a3_ptr .req x10 + b3_ptr .req x11 + b3_cache_ptr .req x12 + count .req x13 + wtmp .req w14 + + modulus .req v0 + modulus_twisted .req v2 + + aa0 .req v3 + aa1 .req v4 + bb0 .req v5 + bb1 .req v6 + bb1t .req v7 + + res0l .req v8 + res1l .req v9 + res0h .req v10 + res1h .req v11 + + tmp0 .req v12 + tmp1 .req v13 + q_tmp0 .req q12 + q_tmp1 .req q13 + + out0 .req v26 + out1 .req v27 + + t0 .req v28 + + .text + .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt): + push_stack + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp + + // Computed bases of vector entries + + add a1_ptr, a0_ptr, #(1 * 512) + add b1_ptr, b0_ptr, #(1 * 512) + add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) + add a2_ptr, a0_ptr, #(2 * 512) + add b2_ptr, b0_ptr, #(2 * 512) + add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) + add a3_ptr, a0_ptr, #(3 * 512) + add b3_ptr, b0_ptr, #(3 * 512) + add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) + + // Bounds: + + // Each pmull is bound by 2*4096*2^15=2^28, so the final value + // before Montgomery reduction is bound by 2^30. + + mov count, #(MLKEM_N / 16) + // Instructions: 114 + // Expected cycles: 153 + // Expected IPC: 0.75 + // + // Cycle bound: 153.0 + // IPC bound: 0.75 + // + // Wall time: 0.69s + // User time: 0.69s + // + // ----------------------------------------------- original position -----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + ldr q23, [x2, #16] // .*................................................................................................................ + ldr q19, [x2], #32 // *................................................................................................................. 
+ ldr q17, [x5], #32 // ..*............................................................................................................... + uzp2 v13.8H, v19.8H, v23.8H // ..........*....................................................................................................... + uzp1 v19.8H, v19.8H, v23.8H // ...........*...................................................................................................... + ldr q23, [x5, #-16] // ...*.............................................................................................................. + ldr q30, [x1, #16] // .....*............................................................................................................ + uzp2 v9.8H, v17.8H, v23.8H // ....*............................................................................................................. + uzp1 v23.8H, v17.8H, v23.8H // .......*.......................................................................................................... + ldr q17, [x1], #32 // ......*........................................................................................................... + ldr q10, [x7, #16] // .............*.................................................................................................... + uzp1 v12.8H, v17.8H, v30.8H // ........*......................................................................................................... + uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................ + smull2 v30.4S, v12.8H, v13.8H // ............*..................................................................................................... + smull v13.4S, v12.4H, v13.4H // ............................................*..................................................................... 
+ smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................ + smull v12.4S, v12.4H, v19.4H // ..........................................*....................................................................... + smlal2 v30.4S, v17.8H, v19.8H // ...............................*.................................................................................. + smlal v13.4S, v17.4H, v19.4H // ...............................................*.................................................................. + ldr q19, [x4], #32 // ....................*............................................................................................. + ldr q16, [x4, #-16] // .....................*............................................................................................ + ld1 {v8.8H}, [x3], #16 // ................................*................................................................................. + uzp1 v26.8H, v19.8H, v16.8H // .......................*.......................................................................................... + uzp2 v19.8H, v19.8H, v16.8H // ........................*......................................................................................... + smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................ + smlal v13.4S, v26.4H, v9.4H // ..................................................*............................................................... + smlal2 v22.4S, v17.8H, v8.8H // ........................................*......................................................................... + smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................ 
+ smlal2 v30.4S, v19.8H, v23.8H // ...................................*.............................................................................. + smlal v13.4S, v19.4H, v23.4H // .......................................................*.......................................................... + smlal2 v22.4S, v26.8H, v23.8H // ...........................................*...................................................................... + smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................ + ldr q23, [x7], #32 // ......................*........................................................................................... + ldr q17, [x8, #16] // ..............*................................................................................................... + uzp1 v9.8H, v23.8H, v10.8H // ..........................*....................................................................................... + uzp2 v23.8H, v23.8H, v10.8H // ....................................*............................................................................. + ldr q10, [x10], #32 // ...............*.................................................................................................. + ldr q16, [x10, #-16] // ................*................................................................................................. + ld1 {v8.8H}, [x12], #16 // .................*................................................................................................ + uzp1 v26.8H, v10.8H, v16.8H // ..................*............................................................................................... + uzp2 v10.8H, v10.8H, v16.8H // ...................*.............................................................................................. 
+ ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................ + ldr q3, [x11, #16] // ...........................*...................................................................................... + smlal2 v22.4S, v19.8H, v16.8H // ..............................................*................................................................... + smlal v12.4S, v19.4H, v16.4H // ........................................................*......................................................... + ldr q19, [x11], #32 // ............................*..................................................................................... + ld1 {v16.8H}, [x9], #16 // .............................*.................................................................................... + uzp1 v4.8H, v19.8H, v3.8H // ..................................*............................................................................... + uzp2 v19.8H, v19.8H, v3.8H // .......................................*.......................................................................... + ldr q3, [x8], #32 // ..............................*................................................................................... + ldr q31, [x2], #32 // ......................................*........................................................................... + uzp1 v6.8H, v3.8H, v17.8H // ...................................................*.............................................................. + uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................ + smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*....................................................... 
+ smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*...................................................... + smlal v13.4S, v9.4H, v17.4H // ............................................................*..................................................... + smlal v12.4S, v9.4H, v6.4H // .............................................................*.................................................... + smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*................................................... + smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*.................................................. + smlal v13.4S, v23.4H, v6.4H // ................................................................*................................................. + smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................ + smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*............................................... + smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*.............................................. + smlal v13.4S, v26.4H, v19.4H // ....................................................................*............................................. + smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................ + smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*........................................... + smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*.......................................... 
+ smlal v13.4S, v10.4H, v4.4H // ........................................................................*......................................... + smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................ + ldr q19, [x2, #-16] // .........................................*........................................................................ + uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*...................................... + uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*............................. + mul v23.8H, v23.8H, v2.8H // .............................................................................*.................................... + uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*................................. + uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*.............................. + mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................ + smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................ + smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*............................... + ldr q23, [x5], #32 // .............................................*.................................................................... + smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*...... 
+ uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*........................... + smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*..... + ldr q17, [x5, #-16] // ................................................*................................................................. + ldr q13, [x1, #16] // ......................................................*........................................................... + uzp2 v27.8H, v23.8H, v17.8H // ....................................................*............................................................. + uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*..................................... + uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*.. + ldr q23, [x1], #32 // ..........................................................................*....................................... + zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................* + ldr q3, [x7, #16] // ........................................................................................*......................... + uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*................................... + uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*.................................. + smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*.......................... 
+ ldr q6, [x8, #16] // .........................................................................................*........................ + ldr q23, [x10], #32 // ..........................................................................................*....................... + smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*....... + ldr q17, [x10, #-16] // ...........................................................................................*...................... + ld1 {v22.8H}, [x12], #16 // ............................................................................................*..................... + uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*.................... + uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*................... + ldr q23, [x4], #32 // ...............................................................................................*.................. + ldr q17, [x4, #-16] // ................................................................................................*................. + ldr q4, [x7], #32 // .................................................................................................*................ + uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*............... + uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*.............. + uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............ 
+ smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*... + ld1 {v8.8H}, [x6], #16 // ....................................................................................................*............. + ldr q25, [x11, #16] // ......................................................................................................*........... + ldr q29, [x11], #32 // .......................................................................................................*.......... + ld1 {v12.8H}, [x9], #16 // ........................................................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*. + ldr q14, [x8], #32 // .........................................................................................................*........ + ld1 {v23.8H}, [x3], #16 // .............................................................................................................*.... + + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q3, [x2], #32 // .*................................................................................................................ + // ldr q17, [x2, #-16] // *................................................................................................................. + // ldr q21, [x5], #32 // ..*............................................................................................................... + // ldr q19, [x5, #-16] // .....*............................................................................................................ 
+ // uzp2 v27.8H, v21.8H, v19.8H // .......*.......................................................................................................... + // ldr q25, [x1, #16] // ......*........................................................................................................... + // ldr q22, [x1], #32 // .........*........................................................................................................ + // uzp1 v28.8H, v21.8H, v19.8H // ........*......................................................................................................... + // uzp1 v31.8H, v22.8H, v25.8H // ...........*...................................................................................................... + // uzp2 v16.8H, v22.8H, v25.8H // ............*..................................................................................................... + // uzp2 v21.8H, v3.8H, v17.8H // ...*.............................................................................................................. + // uzp1 v19.8H, v3.8H, v17.8H // ....*............................................................................................................. + // smull2 v24.4S, v31.8H, v21.8H // .............*.................................................................................................... + // ldr q3, [x7, #16] // ..........*....................................................................................................... + // ldr q6, [x8, #16] // .................................*................................................................................ + // ldr q8, [x10], #32 // ....................................*............................................................................. + // ldr q26, [x10, #-16] // .....................................*............................................................................ 
+ // ld1 {v22.8H}, [x12], #16 // ......................................*........................................................................... + // uzp1 v30.8H, v8.8H, v26.8H // .......................................*.......................................................................... + // uzp2 v11.8H, v8.8H, v26.8H // ........................................*......................................................................... + // ldr q8, [x4], #32 // ...................*.............................................................................................. + // ldr q26, [x4, #-16] // ....................*............................................................................................. + // ldr q4, [x7], #32 // ................................*................................................................................. + // uzp1 v20.8H, v8.8H, v26.8H // ......................*........................................................................................... + // uzp2 v26.8H, v8.8H, v26.8H // .......................*.......................................................................................... + // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................ + // uzp1 v9.8H, v4.8H, v3.8H // ..................................*............................................................................... + // ldr q25, [x11, #16] // ..........................................*....................................................................... + // ldr q29, [x11], #32 // .............................................*.................................................................... + // ld1 {v12.8H}, [x9], #16 // ..............................................*................................................................... 
+ // ldr q14, [x8], #32 // .................................................*................................................................ + // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................ + // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................ + // smlal2 v24.4S, v20.8H, v27.8H // ........................*......................................................................................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*.................................................................. + // smlal2 v24.4S, v26.8H, v28.8H // ............................*..................................................................................... + // uzp2 v4.8H, v4.8H, v3.8H // ...................................*.............................................................................. + // smull2 v13.4S, v31.8H, v19.8H // ...............*.................................................................................................. + // ldr q3, [x2], #32 // ..................................................*............................................................... + // uzp2 v1.8H, v29.8H, v25.8H // ................................................*................................................................. + // smlal2 v13.4S, v16.8H, v23.8H // ..........................*....................................................................................... + // ldr q17, [x2, #-16] // .....................................................................*............................................ + // smull v18.4S, v31.4H, v19.4H // ................*................................................................................................. 
+ // smlal2 v13.4S, v20.8H, v28.8H // ..............................*................................................................................... + // smull v29.4S, v31.4H, v21.4H // ..............*................................................................................................... + // ldr q21, [x5], #32 // ..............................................................................*................................... + // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*...................................................................... + // smlal v29.4S, v16.4H, v19.4H // ..................*............................................................................................... + // ldr q19, [x5, #-16] // ..................................................................................*............................... + // smlal v18.4S, v16.4H, v23.4H // ...........................*...................................................................................... + // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................ + // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*.............................................................. + // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*............................. + // smlal v18.4S, v20.4H, v28.4H // ...............................*.................................................................................. + // ldr q25, [x1, #16] // ...................................................................................*.............................. + // smlal v29.4S, v26.4H, v28.4H // .............................*.................................................................................... 
+ // smlal v18.4S, v26.4H, v8.4H // ............................................*..................................................................... + // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*............................................................. + // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................ + // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*........................................................... + // smlal v29.4S, v9.4H, v26.4H // .......................................................*.......................................................... + // smlal v18.4S, v9.4H, v31.4H // ........................................................*......................................................... + // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................ + // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*....................................................... + // smlal v29.4S, v4.4H, v31.4H // ...........................................................*...................................................... + // smlal v18.4S, v4.4H, v12.4H // ............................................................*..................................................... + // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................................................... + // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*................................................... + // smlal v29.4S, v30.4H, v1.4H // ...............................................................*.................................................. 
+ // smlal v18.4S, v30.4H, v10.4H // ................................................................*................................................. + // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................ + // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*............................................... + // smlal v29.4S, v11.4H, v10.4H // ...................................................................*.............................................. + // smlal v18.4S, v11.4H, v22.4H // ....................................................................*............................................. + // ldr q22, [x1], #32 // .......................................................................................*.......................... + // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*........................................... + // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................ + // mul v19.8H, v31.8H, v2.8H // ........................................................................*......................................... + // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*....................... + // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*...................... + // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................ + // smlal v29.4S, v19.4H, v0.4H // ............................................................................*..................................... 
+ // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*.................................... + // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*....................................... + // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*.......................................... + // mul v23.8H, v26.8H, v2.8H // ...........................................................................*...................................... + // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*................................. + // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*..................... + // ldr q3, [x7, #16] // .........................................................................................*........................ + // ldr q6, [x8, #16] // .............................................................................................*.................... + // ldr q8, [x10], #32 // ..............................................................................................*................... + // ldr q26, [x10, #-16] // ................................................................................................*................. + // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................ + // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*............... + // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*.............. 
+ // ldr q8, [x4], #32 // ....................................................................................................*............. + // ldr q26, [x4, #-16] // .....................................................................................................*............ + // ldr q4, [x7], #32 // ......................................................................................................*........... + // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*.......... + // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*......... + // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*...... + // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........ + // ldr q25, [x11, #16] // ............................................................................................................*..... + // ldr q29, [x11], #32 // .............................................................................................................*.... + // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*... + // ldr q14, [x8], #32 // ................................................................................................................*. + // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*.................. + // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*.................................. 
+ // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................ + // ld1 {v23.8H}, [x3], #16 // .................................................................................................................* + // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*........................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*.. + // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*......................... + + sub count, count, #2 +polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop: + // Instructions: 82 + // Expected cycles: 102 + // Expected IPC: 0.80 + // + // Cycle bound: 102.0 + // IPC bound: 0.80 + // + // Wall time: 15.93s + // User time: 15.93s + // + // ------------------------------- original position -------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------ + smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................ + uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................ + smull2 v13.4S, v31.8H, v19.8H // ..........*....................................................................... + ldr q3, [x2], #32 // ....e............................................................................. + uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*....................... 
+ smlal2 v13.4S, v16.8H, v23.8H // ............*..................................................................... + ldr q17, [x2, #-16] // .....e............................................................................ + smull v18.4S, v31.4H, v19.4H // .........*........................................................................ + smlal2 v13.4S, v20.8H, v28.8H // ...........................*...................................................... + smull v29.4S, v31.4H, v21.4H // .............*.................................................................... + ldr q21, [x5], #32 // .....................e............................................................ + smlal2 v13.4S, v26.8H, v8.8H // .............................*.................................................... + smlal v29.4S, v16.4H, v19.4H // ...............*.................................................................. + ldr q19, [x5, #-16] // ......................e........................................................... + smlal v18.4S, v16.4H, v23.4H // ...........*...................................................................... + smlal v29.4S, v20.4H, v27.4H // ..............................*................................................... + uzp1 v31.8H, v14.8H, v6.8H // ........................................*......................................... + uzp2 v27.8H, v21.8H, v19.8H // ........................e......................................................... + smlal v18.4S, v20.4H, v28.4H // ..........................*....................................................... + ldr q25, [x1, #16] // .e................................................................................ + smlal v29.4S, v26.4H, v28.4H // ................................*................................................. + smlal v18.4S, v26.4H, v8.4H // ............................*..................................................... 
+ uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................ + smlal2 v13.4S, v9.8H, v31.8H // ............................................*..................................... + smlal2 v24.4S, v9.8H, v26.8H // ................................................*................................. + smlal v29.4S, v9.4H, v26.4H // ...............................................*.................................. + smlal v18.4S, v9.4H, v31.4H // ...........................................*...................................... + smlal2 v13.4S, v4.8H, v12.8H // ..............................................*................................... + smlal2 v24.4S, v4.8H, v31.8H // ..................................................*............................... + smlal v29.4S, v4.4H, v31.4H // .................................................*................................ + smlal v18.4S, v4.4H, v12.4H // .............................................*.................................... + smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................... + smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................ + smlal v29.4S, v30.4H, v1.4H // ................................................................*................. + smlal v18.4S, v30.4H, v10.4H // ............................................................*..................... + smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*.................. + smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*.............. + smlal v29.4S, v11.4H, v10.4H // ..................................................................*............... + smlal v18.4S, v11.4H, v22.4H // ..............................................................*................... 
+ ldr q22, [x1], #32 // e................................................................................. + uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........ + uzp1 v28.8H, v21.8H, v19.8H // .......................e.......................................................... + mul v19.8H, v31.8H, v2.8H // ..........................................................................*....... + uzp1 v31.8H, v22.8H, v25.8H // ..e............................................................................... + uzp2 v16.8H, v22.8H, v25.8H // ...e.............................................................................. + uzp2 v21.8H, v3.8H, v17.8H // .......e.......................................................................... + smlal v29.4S, v19.4H, v0.4H // ...........................................................................*...... + smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*..... + uzp1 v19.8H, v3.8H, v17.8H // ......e........................................................................... + uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*............. + zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l.. + mul v23.8H, v26.8H, v2.8H // .....................................................................*............ + uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*.... + smull2 v24.4S, v31.8H, v21.8H // ..............e................................................................... + str q14, [x0, #16] // .................................................................................l + ldr q3, [x7, #16] // ...................................e.............................................. 
+ ldr q6, [x8, #16] // .......................................e.......................................... + ldr q8, [x10], #32 // ...................................................e.............................. + ldr q26, [x10, #-16] // ....................................................e............................. + ld1 {v22.8H}, [x12], #16 // ...........................................................e...................... + uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................ + uzp2 v11.8H, v8.8H, v26.8H // ......................................................e........................... + ldr q8, [x4], #32 // .................e................................................................ + ldr q26, [x4, #-16] // ..................e............................................................... + ldr q4, [x7], #32 // ..................................e............................................... + uzp1 v20.8H, v8.8H, v26.8H // ...................e.............................................................. + uzp2 v26.8H, v8.8H, v26.8H // ....................e............................................................. + ld1 {v8.8H}, [x6], #16 // .........................e........................................................ + uzp1 v9.8H, v4.8H, v3.8H // ....................................e............................................. + ldr q25, [x11, #16] // ........................................................e......................... + ldr q29, [x11], #32 // .......................................................e.......................... + ld1 {v12.8H}, [x9], #16 // ..........................................e....................................... + ldr q14, [x8], #32 // ......................................e........................................... + smlal2 v24.4S, v16.8H, v19.8H // ................e................................................................. 
+ smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*.......... + smlal v18.4S, v23.4H, v0.4H // ......................................................................*........... + ld1 {v23.8H}, [x3], #16 // ........e......................................................................... + smlal2 v24.4S, v20.8H, v27.8H // ...............................e.................................................. + uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................ + str q5, [x0], #32 // ................................................................................l. + zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*... + + // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~......................................... + // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~............................................................. 
+ // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~..................................... + // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~.................................... + // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~............................................................................. + // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~.......................................................................... + // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................ + // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~................................... + // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~.... 
+ // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~......................................................................... + // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~.............................................................................. + // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~.................................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~........................................................................... + // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~....................................................................... + // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~........................... + // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~.................................................................... 
+ // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~....... + // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~.................. + // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~................. + // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~............... + // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~.............. + // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~...................................................................... + // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~................................................................... 
+ // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~....................................... + // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~............................................................... + // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~............. + // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~.............................................................. + // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................ + // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~........................................................... + // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~..................................................................... 
+ // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~................................................................. + // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~... + // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................ + // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................ + // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................ + // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~......................... + // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............ 
+ // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~............................................................................... + // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........ + // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................ + // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................ + // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~.......................................................... + // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~......... + // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~...................................................... 
+ // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~......................................................... + // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~.................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~..................................................... + // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~....................................................... + // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................ + // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~................................................... + // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~.................................................... 
+ // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~....................... + // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~...................... + // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~.................... + // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~................... + // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~.......... + // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~........... + // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~. 
+ // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................ + // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~..................... + // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~.............................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~................................................. + // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~.......................................... + // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~............................................. + // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~............................................... 
+ // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................ + // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~........................................... + // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................ + // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~............................... + // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~............................. + // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~..... + // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~...... 
+ // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~.. + // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................ + // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~...................................... + // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~.................................. + // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~................................. + // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................ + // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................ 
+ // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l.............................. + // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l + // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l.......................... + + sub count, count, #1 + cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop + + // Instructions: 50 + // Expected cycles: 56 + // Expected IPC: 0.89 + // + // Cycle bound: 56.0 + // IPC bound: 0.89 + // + // Wall time: 4.16s + // User time: 4.16s + // + // --------------- original position ---------------> + // 0 25 + // |------------------------| + smull2 v17.4S, v31.8H, v19.8H // ..*............................................... + uzp2 v1.8H, v14.8H, v6.8H // ................*................................. + smull v18.4S, v31.4H, v21.4H // .......*.......................................... + smlal2 v24.4S, v26.8H, v28.8H // *................................................. + smlal2 v17.4S, v16.8H, v23.8H // ....*............................................. + smull v21.4S, v31.4H, v19.4H // .....*............................................ + smlal v18.4S, v16.4H, v19.4H // .........*........................................ + uzp2 v31.8H, v4.8H, v3.8H // .*................................................ + uzp1 v3.8H, v14.8H, v6.8H // ............*..................................... 
+ smlal v21.4S, v16.4H, v23.4H // ..........*....................................... + smlal v18.4S, v20.4H, v27.4H // ...........*...................................... + uzp2 v14.8H, v29.8H, v25.8H // ...*.............................................. + smlal2 v17.4S, v20.8H, v28.8H // ......*........................................... + smlal v21.4S, v20.4H, v28.4H // .............*.................................... + smlal v18.4S, v26.4H, v28.4H // ..............*................................... + smlal2 v24.4S, v9.8H, v1.8H // ..................*............................... + smlal2 v17.4S, v26.8H, v8.8H // ........*......................................... + smlal v21.4S, v26.4H, v8.4H // ...............*.................................. + smlal v18.4S, v9.4H, v1.4H // ...................*.............................. + smlal2 v24.4S, v31.8H, v3.8H // ......................*........................... + smlal2 v17.4S, v9.8H, v3.8H // .................*................................ + smlal v21.4S, v9.4H, v3.4H // ....................*............................. + smlal v18.4S, v31.4H, v3.4H // .......................*.......................... + smlal2 v24.4S, v30.8H, v14.8H // ..........................*....................... + smlal2 v17.4S, v31.8H, v12.8H // .....................*............................ + smlal v21.4S, v31.4H, v12.4H // ........................*......................... + smlal v18.4S, v30.4H, v14.4H // ...........................*...................... + smlal2 v24.4S, v11.8H, v10.8H // ..............................*................... + smlal2 v17.4S, v30.8H, v10.8H // .........................*........................ + smlal v21.4S, v30.4H, v10.4H // ............................*..................... + smlal v18.4S, v11.4H, v10.4H // ...............................*.................. + zip2 v19.8H, v7.8H, v15.8H // ......................................*........... 
+ smlal2 v17.4S, v11.8H, v22.8H // .............................*.................... + smlal v21.4S, v11.4H, v22.4H // ................................*................. + uzp1 v23.8H, v18.8H, v24.8H // .................................*................ + str q19, [x0, #16] // .........................................*........ + mul v19.8H, v23.8H, v2.8H // ..................................*............... + uzp1 v23.8H, v21.8H, v17.8H // .....................................*............ + str q5, [x0], #32 // .............................................*.... + mul v26.8H, v23.8H, v2.8H // .......................................*.......... + smlal v18.4S, v19.4H, v0.4H // ...................................*.............. + smlal2 v24.4S, v19.8H, v0.8H // ....................................*............. + smlal v21.4S, v26.4H, v0.4H // ...........................................*...... + smlal2 v17.4S, v26.8H, v0.8H // ..........................................*....... + uzp2 v13.8H, v18.8H, v24.8H // ........................................*......... + uzp2 v19.8H, v21.8H, v17.8H // ............................................*..... + zip1 v23.8H, v19.8H, v13.8H // ..............................................*... + zip2 v19.8H, v19.8H, v13.8H // ...............................................*.. + str q23, [x0], #32 // .................................................* + str q19, [x0, #-16] // ................................................*. + + // ----------------- new position ------------------> + // 0 25 + // |------------------------|------------------------ + // smlal2 v24.4S, v26.8H, v28.8H // ...*.............................................. + // uzp2 v4.8H, v4.8H, v3.8H // .......*.......................................... + // smull2 v13.4S, v31.8H, v19.8H // *................................................. + // uzp2 v1.8H, v29.8H, v25.8H // ...........*...................................... 
+ // smlal2 v13.4S, v16.8H, v23.8H // ....*............................................. + // smull v18.4S, v31.4H, v19.4H // .....*............................................ + // smlal2 v13.4S, v20.8H, v28.8H // ............*..................................... + // smull v29.4S, v31.4H, v21.4H // ..*............................................... + // smlal2 v13.4S, v26.8H, v8.8H // ................*................................. + // smlal v29.4S, v16.4H, v19.4H // ......*........................................... + // smlal v18.4S, v16.4H, v23.4H // .........*........................................ + // smlal v29.4S, v20.4H, v27.4H // ..........*....................................... + // uzp1 v31.8H, v14.8H, v6.8H // ........*......................................... + // smlal v18.4S, v20.4H, v28.4H // .............*.................................... + // smlal v29.4S, v26.4H, v28.4H // ..............*................................... + // smlal v18.4S, v26.4H, v8.4H // .................*................................ + // uzp2 v26.8H, v14.8H, v6.8H // .*................................................ + // smlal2 v13.4S, v9.8H, v31.8H // ....................*............................. + // smlal2 v24.4S, v9.8H, v26.8H // ...............*.................................. + // smlal v29.4S, v9.4H, v26.4H // ..................*............................... + // smlal v18.4S, v9.4H, v31.4H // .....................*............................ + // smlal2 v13.4S, v4.8H, v12.8H // ........................*......................... + // smlal2 v24.4S, v4.8H, v31.8H // ...................*.............................. + // smlal v29.4S, v4.4H, v31.4H // ......................*........................... + // smlal v18.4S, v4.4H, v12.4H // .........................*........................ + // smlal2 v13.4S, v30.8H, v10.8H // ............................*..................... 
+ // smlal2 v24.4S, v30.8H, v1.8H // .......................*.......................... + // smlal v29.4S, v30.4H, v1.4H // ..........................*....................... + // smlal v18.4S, v30.4H, v10.4H // .............................*.................... + // smlal2 v13.4S, v11.8H, v22.8H // ................................*................. + // smlal2 v24.4S, v11.8H, v10.8H // ...........................*...................... + // smlal v29.4S, v11.4H, v10.4H // ..............................*................... + // smlal v18.4S, v11.4H, v22.4H // .................................*................ + // uzp1 v31.8H, v29.8H, v24.8H // ..................................*............... + // mul v19.8H, v31.8H, v2.8H // ....................................*............. + // smlal v29.4S, v19.4H, v0.4H // ........................................*......... + // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........ + // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............ + // zip2 v14.8H, v7.8H, v15.8H // ...............................*.................. + // mul v23.8H, v26.8H, v2.8H // .......................................*.......... + // uzp2 v15.8H, v29.8H, v24.8H // ............................................*..... + // str q14, [x0, #16] // ...................................*.............. + // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*...... + // smlal v18.4S, v23.4H, v0.4H // ..........................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // .............................................*.... + // str q5, [x0], #32 // ......................................*........... + // zip1 v5.8H, v7.8H, v15.8H // ..............................................*... + // zip2 v14.8H, v7.8H, v15.8H // ...............................................*.. 
+ // str q14, [x0, #16] // .................................................* + // str q5, [x0], #32 // ................................................*. + + + pop_stack + ret + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq out + .unreq a0_ptr + .unreq b0_ptr + .unreq b0_cache_ptr + .unreq a1_ptr + .unreq b1_ptr + .unreq b1_cache_ptr + .unreq a2_ptr + .unreq b2_ptr + .unreq b2_cache_ptr + .unreq a3_ptr + .unreq b3_ptr + .unreq b3_cache_ptr + .unreq count + .unreq modulus + .unreq modulus_twisted + .unreq wtmp + .unreq aa0 + .unreq aa1 + .unreq bb0 + .unreq bb1 + .unreq bb1t + .unreq res0l + .unreq res1l + .unreq res0h + .unreq res1h + .unreq tmp0 + .unreq tmp1 + .unreq q_tmp0 + .unreq q_tmp1 + .unreq out0 + .unreq out1 + .unreq t0 + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 4 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S deleted file mode 100644 index 94f0889b7..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_clean.S +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -// -// AArch64 re-implementation of the asymmetric base multiplication from: -// -// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 -// https://eprint.iacr.org/2021/986 -// https://github.com/neon-ntt/neon-ntt - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - -// Input: -// - Vectors al, ah of 32-bit entries -// Output: -// - Montgomery reductions of al || ah, stored in al -.macro montgomery_reduce_long x, a - uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, modulus_twisted.8h - smlal \a\()l.4s, t0.4h, modulus.4h - smlal2 \a\()h.4s, t0.8h, modulus.8h - uzp2 \x\().8h, \a\()l.8h, \a\()h.8h -.endm 
- -// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit. -// -// Bounds: -// - Assume |a| < 4096, -// - Result: < 2*4096*2^15 = 2^28 -.macro pmull d, a, b - smull \d\()0l.4s, \a\()0.4h, \b\()0.4h - smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smull \d\()1l.4s, \a\()0.4h, \b\()1.4h - smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro pmlal d, a, b - smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h - smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h - smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro ld2_wrap a, ptr - ldr q_tmp0, [\ptr\()], #32 - ldr q_tmp1, [\ptr\(), #-16] - uzp1 \a\()0.8h, tmp0.8h, tmp1.8h - uzp2 \a\()1.8h, tmp0.8h, tmp1.8h -.endm - -.macro st2_wrap a, ptr - zip1 tmp0.8h, \a\()0.8h, \a\()1.8h - zip2 tmp1.8h, \a\()0.8h, \a\()1.8h - str q_tmp0, [\ptr\()], #32 - str q_tmp1, [\ptr\(), #-16] -.endm - -.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2_wrap \a\(), \a_ptr - ld2_wrap \b\(), \b_ptr - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs -.endm - -.macro pop_stack - restore_vregs -.endm - - out .req x0 - a0_ptr .req x1 - b0_ptr .req x2 - b0_cache_ptr .req x3 - a1_ptr .req x4 - b1_ptr .req x5 - b1_cache_ptr .req x6 - a2_ptr .req x7 - b2_ptr .req x8 - b2_cache_ptr .req x9 - a3_ptr .req x10 - b3_ptr .req x11 - b3_cache_ptr 
.req x12 - count .req x13 - wtmp .req w14 - - modulus .req v0 - modulus_twisted .req v2 - - aa0 .req v3 - aa1 .req v4 - bb0 .req v5 - bb1 .req v6 - bb1t .req v7 - - res0l .req v8 - res1l .req v9 - res0h .req v10 - res1h .req v11 - - tmp0 .req v12 - tmp1 .req v13 - q_tmp0 .req q12 - q_tmp1 .req q13 - - out0 .req v26 - out1 .req v27 - - t0 .req v28 - -#if MLKEM_K == 2 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean): - push_stack - - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - - mov count, #(MLKEM_N / 16) -k2_loop_start: - - load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr - pmull res, aa, bb - load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr - pmlal res, aa, bb - - montgomery_reduce_long out0, res0 - montgomery_reduce_long out1, res1 - - st2_wrap out, out - - subs count, count, #1 - cbnz count, k2_loop_start - - pop_stack - ret -#endif /* MLKEM_K == 2 */ - -#if MLKEM_K == 3 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - - mov count, #(MLKEM_N / 16) -k3_loop_start: - - load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr - pmull res, aa, bb - load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr - pmlal res, aa, bb - load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr - pmlal res, aa, 
bb - - montgomery_reduce_long out0, res0 - montgomery_reduce_long out1, res1 - - st2_wrap out, out - - subs count, count, #1 - cbnz count, k3_loop_start - - pop_stack - ret -#endif /* MLKEM_K == 3 */ - -#if MLKEM_K == 4 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - add a3_ptr, a0_ptr, #(3 * 512) - add b3_ptr, b0_ptr, #(3 * 512) - add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) - - // Bounds: - // - // Each pmull is bound by 2*4096*2^15=2^28, so the final value - // before Montgomery reduction is bound by 2^30. 
- - mov count, #(MLKEM_N / 16) -k4_loop_start: - - load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr - pmull res, aa, bb - load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr - pmlal res, aa, bb - load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr - pmlal res, aa, bb - load_polys aa, bb, a3_ptr, b3_ptr, b3_cache_ptr - pmlal res, aa, bb - - montgomery_reduce_long out0, res0 - montgomery_reduce_long out1, res1 - - st2_wrap out, out - - subs count, count, #1 - cbnz count, k4_loop_start - - pop_stack - ret -#endif /* MLKEM_K == 4 */ - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq out - .unreq a0_ptr - .unreq b0_ptr - .unreq b0_cache_ptr - .unreq a1_ptr - .unreq b1_ptr - .unreq b1_cache_ptr - .unreq a2_ptr - .unreq b2_ptr - .unreq b2_cache_ptr - .unreq a3_ptr - .unreq b3_ptr - .unreq b3_cache_ptr - .unreq count - .unreq modulus - .unreq modulus_twisted - .unreq aa0 - .unreq aa1 - .unreq bb0 - .unreq bb1 - .unreq bb1t - .unreq res0l - .unreq res1l - .unreq res0h - .unreq wtmp - .unreq res1h - .unreq tmp0 - .unreq tmp1 - .unreq q_tmp0 - .unreq q_tmp1 - .unreq out0 - .unreq out1 - .unreq t0 - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S deleted file mode 100644 index 275ca06d2..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/polyvec_opt.S +++ /dev/null @@ -1,1606 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -// AArch64 re-implementation of the asymmetric base multiplication from: - -// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 -// https://eprint.iacr.org/2021/986 -// https://github.com/neon-ntt/neon-ntt - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) - -// Input: -// - Vectors al, ah of 32-bit entries 
-// Output: -// - Montgomery reductions of al || ah, stored in al -.macro montgomery_reduce_long x, a - uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, modulus_twisted.8h - smlal \a\()l.4s, t0.4h, modulus.4h - smlal2 \a\()h.4s, t0.8h, modulus.8h - uzp2 \x\().8h, \a\()l.8h, \a\()h.8h -.endm - -// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit. - -// Bounds: -// - Assume |a| < 4096, -// - Result: < 2*4096*2^15 = 2^28 -.macro pmull d, a, b - smull \d\()0l.4s, \a\()0.4h, \b\()0.4h - smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smull \d\()1l.4s, \a\()0.4h, \b\()1.4h - smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro pmlal d, a, b - smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h - smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h - smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro ld2_wrap a, ptr - ldr q_tmp0, [\ptr\()], #32 - ldr q_tmp1, [\ptr\(), #-16] - uzp1 \a\()0.8h, tmp0.8h, tmp1.8h - uzp2 \a\()1.8h, tmp0.8h, tmp1.8h -.endm - -.macro st2_wrap a, ptr - zip1 tmp0.8h, \a\()0.8h, \a\()1.8h - zip2 tmp1.8h, \a\()0.8h, \a\()1.8h - str q_tmp0, [\ptr\()], #32 - str q_tmp1, [\ptr\(), #-16] -.endm - -.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2_wrap \a\(), \a_ptr - ld2_wrap \b\(), \b_ptr - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - 
save_vregs -.endm - -.macro pop_stack - restore_vregs -.endm - - out .req x0 - a0_ptr .req x1 - b0_ptr .req x2 - b0_cache_ptr .req x3 - a1_ptr .req x4 - b1_ptr .req x5 - b1_cache_ptr .req x6 - a2_ptr .req x7 - b2_ptr .req x8 - b2_cache_ptr .req x9 - a3_ptr .req x10 - b3_ptr .req x11 - b3_cache_ptr .req x12 - count .req x13 - wtmp .req w14 - - modulus .req v0 - modulus_twisted .req v2 - - aa0 .req v3 - aa1 .req v4 - bb0 .req v5 - bb1 .req v6 - bb1t .req v7 - - res0l .req v8 - res1l .req v9 - res0h .req v10 - res1h .req v11 - - tmp0 .req v12 - tmp1 .req v13 - q_tmp0 .req q12 - q_tmp1 .req q13 - - out0 .req v26 - out1 .req v27 - - t0 .req v28 - -#if MLKEM_K == 2 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt): - push_stack - - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - - mov count, #(MLKEM_N / 16) - // Instructions: 75 - // Expected cycles: 94 - // Expected IPC: 0.80 - - // Cycle bound: 94.0 - // IPC bound: 0.80 - - // Wall time: 1.49s - // User time: 1.49s - - // --------------------------- original position ----------------------------> - // 0 25 50 - // |------------------------|------------------------| - ldr q9, [x4], #32 // *.......................................................................... - ldr q5, [x4, #-16] // ......*.................................................................... - ldr q11, [x5], #32 // .*......................................................................... - uzp1 v23.8H, v9.8H, v5.8H // .........*................................................................. - uzp2 v9.8H, v9.8H, v5.8H // .....................*..................................................... 
- ldr q5, [x2], #32 // ..*........................................................................ - ldr q7, [x5, #-16] // ..............*............................................................ - ldr q21, [x2, #-16] // ...*....................................................................... - uzp2 v10.8H, v11.8H, v7.8H // .................*......................................................... - uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................ - uzp1 v7.8H, v5.8H, v21.8H // ....*...................................................................... - uzp2 v5.8H, v5.8H, v21.8H // .....*..................................................................... - ldr q21, [x1], #32 // .......*................................................................... - ldr q25, [x1, #-16] // ........*.................................................................. - ld1 {v6.8H}, [x3], #16 // ............................*.............................................. - uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................ - uzp2 v21.8H, v21.8H, v25.8H // ...........*............................................................... - smull v25.4S, v26.4H, v5.4H // ............*.............................................................. - smull2 v5.4S, v26.8H, v5.8H // .............*............................................................. - smull v19.4S, v26.4H, v7.4H // ..........................*................................................ - smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................ - smlal v25.4S, v21.4H, v7.4H // ...............*........................................................... - smlal2 v5.4S, v21.8H, v7.8H // ................*.......................................................... 
- smlal v19.4S, v21.4H, v6.4H // ...................................*....................................... - smlal2 v26.4S, v21.8H, v6.8H // .................................*......................................... - smlal v25.4S, v23.4H, v10.4H // ...................*....................................................... - smlal2 v5.4S, v23.8H, v10.8H // ....................*...................................................... - smlal v19.4S, v23.4H, v11.4H // ......................................*.................................... - smlal2 v26.4S, v23.8H, v11.8H // ....................................*...................................... - ld1 {v23.8H}, [x6], #16 // ........................*.................................................. - smlal v25.4S, v9.4H, v11.4H // ......................*.................................................... - smlal2 v5.4S, v9.8H, v11.8H // .......................*................................................... - smlal2 v26.4S, v9.8H, v23.8H // .......................................*................................... - smlal v19.4S, v9.4H, v23.4H // .........................................*................................. - ldr q9, [x4], #32 // ...............................*........................................... - uzp1 v11.8H, v25.8H, v5.8H // .........................*................................................. - uzp1 v23.8H, v19.8H, v26.8H // .............................................*............................. - mul v11.8H, v11.8H, v2.8H // ...........................*............................................... - mul v23.8H, v23.8H, v2.8H // ..............................................*............................ - ldr q7, [x5], #32 // ................................*.......................................... - smlal2 v5.4S, v11.8H, v0.8H // .............................*............................................. 
- smlal v25.4S, v11.4H, v0.4H // ..................................*........................................ - ldr q11, [x2], #32 // .....................................*..................................... - ldr q21, [x2, #-16] // ........................................*.................................. - ldr q6, [x4, #-16] // ...............................................*........................... - uzp1 v17.8H, v11.8H, v21.8H // ...........................................*............................... - ldr q10, [x1], #32 // ................................................*.......................... - ldr q29, [x1, #-16] // .................................................*......................... - uzp2 v11.8H, v11.8H, v21.8H // ............................................*.............................. - uzp1 v13.8H, v9.8H, v6.8H // ...................................................*....................... - uzp1 v3.8H, v10.8H, v29.8H // ....................................................*...................... - uzp2 v10.8H, v10.8H, v29.8H // .....................................................*..................... - smull v12.4S, v3.4H, v11.4H // ......................................................*.................... - smull2 v11.4S, v3.8H, v11.8H // .......................................................*................... - ldr q21, [x5, #-16] // ........................................................*.................. - smlal v12.4S, v10.4H, v17.4H // .........................................................*................. - smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................ - uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*............... - uzp1 v15.8H, v7.8H, v21.8H // ............................................................*.............. 
- smlal v12.4S, v13.4H, v29.4H // .............................................................*............. - smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............ - uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*........... - smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................ - smlal v12.4S, v28.4H, v15.4H // .................................................................*......... - smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........ - smlal v19.4S, v23.4H, v0.4H // ................................................................*.......... - uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................ - smull v23.4S, v3.4H, v17.4H // ......................................................................*.... - uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*..... - uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*...... - mul v14.8H, v9.8H, v2.8H // .......................................................................*... - ld1 {v22.8H}, [x6], #16 // ...................................................................*....... - zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. - smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................* - ld1 {v4.8H}, [x3], #16 // .........................................................................*. 
- - // ------------------------------ new position ------------------------------> - // 0 25 50 - // |------------------------|------------------------|------------------------ - // ldr q18, [x4], #32 // *.......................................................................... - // ldr q30, [x5], #32 // ..*........................................................................ - // ldr q8, [x2], #32 // .....*..................................................................... - // ldr q9, [x2, #-16] // .......*................................................................... - // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................ - // uzp2 v4.8H, v8.8H, v9.8H // ...........*............................................................... - // ldr q19, [x4, #-16] // .*......................................................................... - // ldr q29, [x1], #32 // ............*.............................................................. - // ldr q12, [x1, #-16] // .............*............................................................. - // uzp1 v13.8H, v18.8H, v19.8H // ...*....................................................................... - // uzp1 v3.8H, v29.8H, v12.8H // ...............*........................................................... - // uzp2 v10.8H, v29.8H, v12.8H // ................*.......................................................... - // smull v12.4S, v3.4H, v4.4H // .................*......................................................... - // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................ - // ldr q5, [x5, #-16] // ......*.................................................................... - // smlal v12.4S, v10.4H, v17.4H // .....................*..................................................... 
- // smlal2 v11.4S, v10.8H, v17.8H // ......................*.................................................... - // uzp2 v14.8H, v30.8H, v5.8H // ........*.................................................................. - // uzp1 v15.8H, v30.8H, v5.8H // .........*................................................................. - // smlal v12.4S, v13.4H, v14.4H // .........................*................................................. - // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................ - // uzp2 v28.8H, v18.8H, v19.8H // ....*...................................................................... - // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................ - // smlal2 v11.4S, v28.8H, v15.8H // ...............................*........................................... - // ld1 {v22.8H}, [x6], #16 // .............................*............................................. - // uzp1 v1.8H, v12.8H, v11.8H // ...................................*....................................... - // smull v23.4S, v3.4H, v17.4H // ...................*....................................................... - // mul v14.8H, v1.8H, v2.8H // .....................................*..................................... - // ld1 {v4.8H}, [x3], #16 // ..............*............................................................ - // smlal2 v11.4S, v14.8H, v0.8H // ........................................*.................................. - // smull2 v20.4S, v3.8H, v17.8H // ....................*...................................................... - // ldr q18, [x4], #32 // ..................................*........................................ - // ldr q30, [x5], #32 // .......................................*................................... - // smlal2 v20.4S, v10.8H, v4.8H // ........................*.................................................. 
- // smlal v12.4S, v14.4H, v0.4H // .........................................*................................. - // smlal v23.4S, v10.4H, v4.4H // .......................*................................................... - // smlal2 v20.4S, v13.8H, v15.8H // ............................*.............................................. - // ldr q8, [x2], #32 // ..........................................*................................ - // smlal v23.4S, v13.4H, v15.4H // ...........................*............................................... - // smlal2 v20.4S, v28.8H, v22.8H // ................................*.......................................... - // ldr q9, [x2, #-16] // ...........................................*............................... - // smlal v23.4S, v28.4H, v22.4H // .................................*......................................... - // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........ - // uzp1 v17.8H, v8.8H, v9.8H // .............................................*............................. - // uzp2 v4.8H, v8.8H, v9.8H // ................................................*.......................... - // uzp1 v5.8H, v23.8H, v20.8H // ....................................*...................................... - // mul v31.8H, v5.8H, v2.8H // ......................................*.................................... - // ldr q19, [x4, #-16] // ............................................*.............................. - // ldr q29, [x1], #32 // ..............................................*............................ - // ldr q12, [x1, #-16] // ...............................................*........................... - // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............ - // uzp1 v13.8H, v18.8H, v19.8H // .................................................*......................... 
- // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................ - // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*....................... - // smull v12.4S, v3.4H, v4.4H // ....................................................*...................... - // smull2 v11.4S, v3.8H, v4.8H // .....................................................*..................... - // ldr q5, [x5, #-16] // ......................................................*.................... - // smlal v12.4S, v10.4H, v17.4H // .......................................................*................... - // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*.................. - // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*................. - // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................ - // smlal v12.4S, v13.4H, v14.4H // ...........................................................*............... - // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*.............. - // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*............. - // smlal v23.4S, v31.4H, v0.4H // .................................................................*......... - // smlal v12.4S, v28.4H, v15.4H // ...............................................................*........... - // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*.......... - // ld1 {v22.8H}, [x6], #16 // .......................................................................*... - // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*..... 
- // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*...... - // smull v23.4S, v3.4H, v17.4H // ...................................................................*....... - // mul v14.8H, v1.8H, v2.8H // ......................................................................*.... - // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. - // ld1 {v4.8H}, [x3], #16 // ..........................................................................* - // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*. - - sub count, count, #2 -1: - // Instructions: 48 - // Expected cycles: 58 - // Expected IPC: 0.83 - - // Cycle bound: 58.0 - // IPC bound: 0.83 - - // Wall time: 6.39s - // User time: 6.39s - - // -------------- original position --------------> - // 0 25 - // |------------------------|---------------------- - smull2 v20.4S, v3.8H, v17.8H // ..........*..................................... - ldr q18, [x4], #32 // .................e.............................. - ldr q30, [x5], #32 // .....................e.......................... - smlal2 v20.4S, v10.8H, v4.8H // ............*................................... - smlal v12.4S, v14.4H, v0.4H // .........................................*...... - smlal v23.4S, v10.4H, v4.4H // ...........*.................................... - str q9, [x0, #16] // ...............................................l - smlal2 v20.4S, v13.8H, v15.8H // ...........................*.................... - ldr q8, [x2], #32 // ....e........................................... - smlal v23.4S, v13.4H, v15.4H // ..........................*..................... - smlal2 v20.4S, v28.8H, v22.8H // .............................*.................. - zip1 v26.8H, v19.8H, v27.8H // ............................................l... 
- ldr q9, [x2, #-16] // .....e.......................................... - smlal v23.4S, v28.4H, v22.4H // ............................*................... - uzp2 v27.8H, v12.8H, v11.8H // ...........................................*.... - uzp1 v17.8H, v8.8H, v9.8H // ......e......................................... - uzp2 v4.8H, v8.8H, v9.8H // .......e........................................ - uzp1 v5.8H, v23.8H, v20.8H // ..................................*............. - str q26, [x0], #32 // ..............................................l. - mul v31.8H, v5.8H, v2.8H // ...................................*............ - ldr q19, [x4, #-16] // ..................e............................. - ldr q29, [x1], #32 // e............................................... - ldr q12, [x1, #-16] // .e.............................................. - smlal2 v20.4S, v31.8H, v0.8H // .....................................*.......... - uzp1 v13.8H, v18.8H, v19.8H // ...................e............................ - uzp1 v3.8H, v29.8H, v12.8H // ..e............................................. - uzp2 v10.8H, v29.8H, v12.8H // ...e............................................ - smull v12.4S, v3.4H, v4.4H // .............e.................................. - smull2 v11.4S, v3.8H, v4.8H // ..............e................................. - ldr q5, [x5, #-16] // ......................e......................... - smlal v12.4S, v10.4H, v17.4H // ...............e................................ - smlal2 v11.4S, v10.8H, v17.8H // ................e............................... - uzp2 v14.8H, v30.8H, v5.8H // ........................e....................... - uzp1 v15.8H, v30.8H, v5.8H // .......................e........................ - smlal v12.4S, v13.4H, v14.4H // ..............................e................. - smlal2 v11.4S, v13.8H, v14.8H // ...............................e................ - uzp2 v28.8H, v18.8H, v19.8H // ....................e........................... 
- smlal v23.4S, v31.4H, v0.4H // ....................................*........... - smlal v12.4S, v28.4H, v15.4H // ................................e............... - smlal2 v11.4S, v28.8H, v15.8H // .................................e.............. - ld1 {v22.8H}, [x6], #16 // .........................e...................... - uzp2 v19.8H, v23.8H, v20.8H // ......................................*......... - uzp1 v1.8H, v12.8H, v11.8H // .......................................e........ - smull v23.4S, v3.4H, v17.4H // .........e...................................... - mul v14.8H, v1.8H, v2.8H // ........................................e....... - zip2 v9.8H, v19.8H, v27.8H // .............................................*.. - ld1 {v4.8H}, [x3], #16 // ........e....................................... - smlal2 v11.4S, v14.8H, v0.8H // ..........................................e..... - - // ------------------------------------------------- new position --------------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------------- - // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'.................. - // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'.................. - // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'.................. - // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'.................. - // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~.......... 
- // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~...... - // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~... - // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~.. - // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'.................. - // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'.................. - // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~.................. - // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~............. - // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~............... - // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'.................. - // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'.................. - // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'.................. - // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'.................. - // ldr q12, [x4], #32 // e..............................................'~..............................................'~................. 
- // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'.................. - // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'.................. - // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'.................. - // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................ - // ldr q13, [x5, #-16] // ............................e..................'............................~..................'.................. - // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'.................. - // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'.................. - // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'.................. - // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~......... - // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~........... - // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~..... - // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........ - // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'.................. 
- // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'.................. - // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'.................. - // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'.................. - // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~. - // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'.................. - // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'.................. - // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'.................. - // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'.................. - // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'.................. - // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'.................. - // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~.............. - // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'.................. - // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~.... 
- // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l....... - // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'.................. - // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l - // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............ - - sub count, count, #1 - cbnz count, 1b - // Instructions: 21 - // Expected cycles: 35 - // Expected IPC: 0.60 - - // Cycle bound: 35.0 - // IPC bound: 0.60 - - // Wall time: 0.08s - // User time: 0.08s - - // ----- original position -----> - // 0 25 - // |------------------------|---- - smull2 v5.4S, v3.8H, v17.8H // *............................. - smlal v12.4S, v14.4H, v0.4H // ..*........................... - smlal v23.4S, v10.4H, v4.4H // ...*.......................... - str q9, [x0, #16] // ....*......................... - smlal2 v5.4S, v10.8H, v4.8H // .*............................ - uzp2 v11.8H, v12.8H, v11.8H // ..........*................... - zip1 v9.8H, v19.8H, v27.8H // ........*..................... - smlal v23.4S, v13.4H, v15.4H // ......*....................... - smlal2 v5.4S, v13.8H, v15.8H // .....*........................ - str q9, [x0], #32 // ............*................. - smlal v23.4S, v28.4H, v22.4H // .........*.................... - smlal2 v5.4S, v28.8H, v22.8H // .......*...................... - uzp1 v9.8H, v23.8H, v5.8H // ...........*.................. - mul v9.8H, v9.8H, v2.8H // .............*................ - smlal2 v5.4S, v9.8H, v0.8H // ..............*............... - smlal v23.4S, v9.4H, v0.4H // ...............*.............. - uzp2 v9.8H, v23.8H, v5.8H // ................*............. - zip2 v5.8H, v9.8H, v11.8H // .................*............ 
- zip1 v9.8H, v9.8H, v11.8H // ...................*.......... - str q5, [x0, #16] // ..................*........... - str q9, [x0], #32 // ....................*......... - - // -------- new position --------> - // 0 25 - // |------------------------|----- - // smull2 v20.4S, v3.8H, v17.8H // *.............................. - // smlal2 v20.4S, v10.8H, v4.8H // ....*.......................... - // smlal v12.4S, v14.4H, v0.4H // .*............................. - // smlal v23.4S, v10.4H, v4.4H // ..*............................ - // str q9, [x0, #16] // ...*........................... - // smlal2 v20.4S, v13.8H, v15.8H // ........*...................... - // smlal v23.4S, v13.4H, v15.4H // .......*....................... - // smlal2 v20.4S, v28.8H, v22.8H // ...........*................... - // zip1 v26.8H, v19.8H, v27.8H // ......*........................ - // smlal v23.4S, v28.4H, v22.4H // ..........*.................... - // uzp2 v27.8H, v12.8H, v11.8H // .....*......................... - // uzp1 v5.8H, v23.8H, v20.8H // ............*.................. - // str q26, [x0], #32 // .........*..................... - // mul v31.8H, v5.8H, v2.8H // .............*................. - // smlal2 v20.4S, v31.8H, v0.8H // ..............*................ - // smlal v23.4S, v31.4H, v0.4H // ...............*............... - // uzp2 v19.8H, v23.8H, v20.8H // ................*.............. - // zip2 v9.8H, v19.8H, v27.8H // .................*............. - // str q9, [x0, #16] // ...................*........... - // zip1 v26.8H, v19.8H, v27.8H // ..................*............ - // str q26, [x0], #32 // ....................*.......... 
- - - pop_stack - ret -#endif /* MLKEM_K == 2 */ - -#if MLKEM_K == 3 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - - mov count, #(MLKEM_N / 16) - // Instructions: 75 - // Expected cycles: 103 - // Expected IPC: 0.73 - - // Cycle bound: 103.0 - // IPC bound: 0.73 - - // Wall time: 0.94s - // User time: 0.94s - - // --------------------------- original position ----------------------------> - // 0 25 50 - // |------------------------|------------------------| - ldr q7, [x2, #16] // *.......................................................................... - ldr q20, [x2], #32 // ..*........................................................................ - ldr q15, [x1, #16] // .*......................................................................... - uzp1 v8.8H, v20.8H, v7.8H // ...............*........................................................... - uzp2 v7.8H, v20.8H, v7.8H // ................*.......................................................... - ld1 {v20.8H}, [x3], #16 // ...*....................................................................... - ldr q30, [x1], #32 // ..............*............................................................ - ldr q11, [x4], #32 // ....*...................................................................... - uzp1 v16.8H, v30.8H, v15.8H // .................*......................................................... - uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................ 
- smull v30.4S, v16.4H, v7.4H // ...................*....................................................... - smull2 v7.4S, v16.8H, v7.8H // ....................*...................................................... - smull v9.4S, v16.4H, v8.4H // .....................*..................................................... - smull2 v16.4S, v16.8H, v8.8H // ......................*.................................................... - smlal v30.4S, v15.4H, v8.4H // .......................*................................................... - smlal2 v7.4S, v15.8H, v8.8H // ........................*.................................................. - smlal v9.4S, v15.4H, v20.4H // .........................*................................................. - smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................ - ldr q20, [x4, #-16] // .....*..................................................................... - ldr q15, [x5], #32 // ......*.................................................................... - uzp1 v8.8H, v11.8H, v20.8H // ...........................*............................................... - uzp2 v20.8H, v11.8H, v20.8H // ............................*.............................................. - ldr q11, [x5, #-16] // .......*................................................................... - ld1 {v27.8H}, [x6], #16 // ........*.................................................................. - uzp1 v10.8H, v15.8H, v11.8H // .............................*............................................. - uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................ - smlal v9.4S, v8.4H, v10.4H // ...............................*........................................... - smlal2 v16.4S, v8.8H, v10.8H // ................................*.......................................... 
- smlal v30.4S, v8.4H, v15.4H // .................................*......................................... - smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................ - smlal v9.4S, v20.4H, v27.4H // ...................................*....................................... - smlal2 v16.4S, v20.8H, v27.8H // ....................................*...................................... - smlal v30.4S, v20.4H, v10.4H // .....................................*..................................... - smlal2 v7.4S, v20.8H, v10.8H // ......................................*.................................... - ldr q20, [x7], #32 // .........*................................................................. - ldr q15, [x7, #-16] // ..........*................................................................ - ldr q8, [x8], #32 // ...........*............................................................... - uzp1 v11.8H, v20.8H, v15.8H // .......................................*................................... - uzp2 v20.8H, v20.8H, v15.8H // ........................................*.................................. - ldr q15, [x8, #-16] // ............*.............................................................. - ld1 {v27.8H}, [x9], #16 // .............*............................................................. - uzp1 v10.8H, v8.8H, v15.8H // .........................................*................................. - uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................ - smlal v9.4S, v11.4H, v10.4H // ...........................................*............................... - smlal2 v16.4S, v11.8H, v10.8H // ............................................*.............................. - smlal v30.4S, v11.4H, v15.4H // .............................................*............................. 
- smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................ - smlal v9.4S, v20.4H, v27.4H // ...............................................*........................... - smlal2 v16.4S, v20.8H, v27.8H // ................................................*.......................... - smlal v30.4S, v20.4H, v10.4H // .................................................*......................... - smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................ - ldr q15, [x2], #32 // ...............................................................*........... - uzp1 v20.8H, v9.8H, v16.8H // ....................................................*...................... - uzp1 v8.8H, v30.8H, v7.8H // .....................................................*..................... - mul v20.8H, v20.8H, v2.8H // ......................................................*.................... - mul v8.8H, v8.8H, v2.8H // .......................................................*................... - ldr q21, [x4], #32 // .................................................................*......... - smlal v9.4S, v20.4H, v0.4H // ........................................................*.................. - smlal2 v16.4S, v20.8H, v0.8H // .........................................................*................. - smlal v30.4S, v8.4H, v0.4H // ..........................................................*................ - smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*............... - ldr q6, [x4, #-16] // ..................................................................*........ - uzp2 v27.8H, v9.8H, v16.8H // ............................................................*.............. - uzp2 v10.8H, v30.8H, v7.8H // .............................................................*............. 
- ldr q16, [x2, #-16] // ...................................................*....................... - ldr q30, [x1, #16] // ..............................................................*............ - ld1 {v9.8H}, [x3], #16 // ................................................................*.......... - ldr q1, [x5], #32 // ...................................................................*....... - ldr q12, [x5, #-16] // ....................................................................*...... - ld1 {v24.8H}, [x6], #16 // .....................................................................*..... - ldr q19, [x7], #32 // ......................................................................*.... - ldr q31, [x7, #-16] // .......................................................................*... - ldr q17, [x8], #32 // ........................................................................*.. - ldr q18, [x8, #-16] // .........................................................................*. - ld1 {v25.8H}, [x9], #16 // ..........................................................................* - - // ------------------------------ new position ------------------------------> - // 0 25 50 - // |------------------------|------------------------|------------------------ - // ldr q16, [x2, #16] // *.......................................................................... - // ldr q30, [x1, #16] // ..*........................................................................ - // ldr q15, [x2], #32 // .*......................................................................... - // ld1 {v9.8H}, [x3], #16 // .....*..................................................................... - // ldr q21, [x4], #32 // .......*................................................................... - // ldr q6, [x4, #-16] // ..................*........................................................ 
- // ldr q1, [x5], #32 // ...................*....................................................... - // ldr q12, [x5, #-16] // ......................*.................................................... - // ld1 {v24.8H}, [x6], #16 // .......................*................................................... - // ldr q19, [x7], #32 // ..................................*........................................ - // ldr q31, [x7, #-16] // ...................................*....................................... - // ldr q17, [x8], #32 // ....................................*...................................... - // ldr q18, [x8, #-16] // .......................................*................................... - // ld1 {v25.8H}, [x9], #16 // ........................................*.................................. - // ldr q20, [x1], #32 // ......*.................................................................... - // uzp1 v7.8H, v15.8H, v16.8H // ...*....................................................................... - // uzp2 v15.8H, v15.8H, v16.8H // ....*...................................................................... - // uzp1 v8.8H, v20.8H, v30.8H // ........*.................................................................. - // uzp2 v20.8H, v20.8H, v30.8H // .........*................................................................. - // smull v30.4S, v8.4H, v15.4H // ..........*................................................................ - // smull2 v15.4S, v8.8H, v15.8H // ...........*............................................................... - // smull v11.4S, v8.4H, v7.4H // ............*.............................................................. - // smull2 v8.4S, v8.8H, v7.8H // .............*............................................................. - // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................ 
- // smlal2 v15.4S, v20.8H, v7.8H // ...............*........................................................... - // smlal v11.4S, v20.4H, v9.4H // ................*.......................................................... - // smlal2 v8.4S, v20.8H, v9.8H // .................*......................................................... - // uzp1 v7.8H, v21.8H, v6.8H // ....................*...................................................... - // uzp2 v20.8H, v21.8H, v6.8H // .....................*..................................................... - // uzp1 v16.8H, v1.8H, v12.8H // ........................*.................................................. - // uzp2 v9.8H, v1.8H, v12.8H // .........................*................................................. - // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................ - // smlal2 v8.4S, v7.8H, v16.8H // ...........................*............................................... - // smlal v30.4S, v7.4H, v9.4H // ............................*.............................................. - // smlal2 v15.4S, v7.8H, v9.8H // .............................*............................................. - // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................ - // smlal2 v8.4S, v20.8H, v24.8H // ...............................*........................................... - // smlal v30.4S, v20.4H, v16.4H // ................................*.......................................... - // smlal2 v15.4S, v20.8H, v16.8H // .................................*......................................... - // uzp1 v7.8H, v19.8H, v31.8H // .....................................*..................................... - // uzp2 v20.8H, v19.8H, v31.8H // ......................................*.................................... 
- // uzp1 v16.8H, v17.8H, v18.8H // .........................................*................................. - // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................ - // smlal v11.4S, v7.4H, v16.4H // ...........................................*............................... - // smlal2 v8.4S, v7.8H, v16.8H // ............................................*.............................. - // smlal v30.4S, v7.4H, v9.4H // .............................................*............................. - // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................ - // smlal v11.4S, v20.4H, v25.4H // ...............................................*........................... - // smlal2 v8.4S, v20.8H, v25.8H // ................................................*.......................... - // smlal v30.4S, v20.4H, v16.4H // .................................................*......................... - // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................ - // ldr q16, [x2, #16] // ................................................................*.......... - // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*...................... - // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*..................... - // mul v7.8H, v7.8H, v2.8H // ......................................................*.................... - // mul v20.8H, v20.8H, v2.8H // .......................................................*................... - // smlal v11.4S, v7.4H, v0.4H // .........................................................*................. - // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................ - // smlal v30.4S, v20.4H, v0.4H // ...........................................................*............... 
- // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*.............. - // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............ - // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*........... - // ldr q30, [x1, #16] // .................................................................*......... - // ldr q15, [x2], #32 // ...................................................*....................... - // ld1 {v9.8H}, [x3], #16 // ..................................................................*........ - // ldr q21, [x4], #32 // ........................................................*.................. - // ldr q6, [x4, #-16] // .............................................................*............. - // ldr q1, [x5], #32 // ...................................................................*....... - // ldr q12, [x5, #-16] // ....................................................................*...... - // ld1 {v24.8H}, [x6], #16 // .....................................................................*..... - // ldr q19, [x7], #32 // ......................................................................*.... - // ldr q31, [x7, #-16] // .......................................................................*... - // ldr q17, [x8], #32 // ........................................................................*.. - // ldr q18, [x8, #-16] // .........................................................................*. 
- // ld1 {v25.8H}, [x9], #16 // ..........................................................................* - - sub count, count, #2 -1: - // Instructions: 65 - // Expected cycles: 80 - // Expected IPC: 0.81 - - // Cycle bound: 80.0 - // IPC bound: 0.81 - - // Wall time: 11.64s - // User time: 11.64s - - // ---------------------- original position -----------------------> - // 0 25 50 - // |------------------------|------------------------|-------------- - ldr q20, [x1], #32 // *................................................................ - uzp1 v7.8H, v15.8H, v16.8H // ......*.......................................................... - uzp2 v15.8H, v15.8H, v16.8H // .......*......................................................... - uzp1 v8.8H, v20.8H, v30.8H // ..*.............................................................. - uzp2 v20.8H, v20.8H, v30.8H // ...*............................................................. - smull v30.4S, v8.4H, v15.4H // .............*................................................... - smull2 v15.4S, v8.8H, v15.8H // ..............*.................................................. - smull v11.4S, v8.4H, v7.4H // .........*....................................................... - smull2 v8.4S, v8.8H, v7.8H // ..........*...................................................... - smlal v30.4S, v20.4H, v7.4H // ...............*................................................. - smlal2 v15.4S, v20.8H, v7.8H // ................*................................................ - smlal v11.4S, v20.4H, v9.4H // ...........*..................................................... - smlal2 v8.4S, v20.8H, v9.8H // ............*.................................................... - uzp1 v7.8H, v21.8H, v6.8H // ...................*............................................. - uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................ 
- uzp1 v16.8H, v1.8H, v12.8H // .......................*......................................... - uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................ - smlal v11.4S, v7.4H, v16.4H // ..........................*...................................... - smlal2 v8.4S, v7.8H, v16.8H // ...........................*..................................... - smlal v30.4S, v7.4H, v9.4H // ..............................*.................................. - smlal2 v15.4S, v7.8H, v9.8H // ...............................*................................. - smlal v11.4S, v20.4H, v24.4H // ............................*.................................... - smlal2 v8.4S, v20.8H, v24.8H // .............................*................................... - smlal v30.4S, v20.4H, v16.4H // ................................*................................ - smlal2 v15.4S, v20.8H, v16.8H // .................................*............................... - uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................ - uzp2 v20.8H, v19.8H, v31.8H // .....................................*........................... - uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................ - uzp2 v9.8H, v17.8H, v18.8H // .........................................*....................... - smlal v11.4S, v7.4H, v16.4H // ...........................................*..................... - smlal2 v8.4S, v7.8H, v16.8H // ............................................*.................... - smlal v30.4S, v7.4H, v9.4H // ...............................................*................. - smlal2 v15.4S, v7.8H, v9.8H // ................................................*................ - smlal v11.4S, v20.4H, v25.4H // .............................................*................... - smlal2 v8.4S, v20.8H, v25.8H // ..............................................*.................. 
- smlal v30.4S, v20.4H, v16.4H // .................................................*............... - smlal2 v15.4S, v20.8H, v16.8H // ..................................................*.............. - ldr q16, [x2, #16] // .....e........................................................... - uzp1 v7.8H, v11.8H, v8.8H // ...................................................*............. - uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........ - mul v7.8H, v7.8H, v2.8H // ....................................................*............ - mul v20.8H, v20.8H, v2.8H // .........................................................*....... - zip2 v9.8H, v27.8H, v10.8H // ..............................................................l.. - zip1 v27.8H, v27.8H, v10.8H // .............................................................l... - smlal v11.4S, v7.4H, v0.4H // .....................................................*........... - smlal2 v8.4S, v7.8H, v0.8H // ......................................................*.......... - smlal v30.4S, v20.4H, v0.4H // ..........................................................*...... - smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*..... - str q27, [x0], #32 // ...............................................................l. - uzp2 v27.8H, v11.8H, v8.8H // .......................................................*......... - str q9, [x0, #-16] // ................................................................l - uzp2 v10.8H, v30.8H, v15.8H // ............................................................*.... - ldr q30, [x1, #16] // .e............................................................... - ldr q15, [x2], #32 // ....e............................................................ - ld1 {v9.8H}, [x3], #16 // ........e........................................................ 
- ldr q21, [x4], #32 // .................e............................................... - ldr q6, [x4, #-16] // ..................e.............................................. - ldr q1, [x5], #32 // .....................e........................................... - ldr q12, [x5, #-16] // ......................e.......................................... - ld1 {v24.8H}, [x6], #16 // .........................e....................................... - ldr q19, [x7], #32 // ..................................e.............................. - ldr q31, [x7, #-16] // ...................................e............................. - ldr q17, [x8], #32 // ......................................e.......................... - ldr q18, [x8, #-16] // .......................................e......................... - ld1 {v25.8H}, [x9], #16 // ..........................................e...................... - - // ---------------------------------------------------------------- new position -----------------------------------------------------------------> - // 0 25 50 75 100 125 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ - // ldr q12, [x1], #32 // ............................*................................................................~.................................................. - // ldr q13, [x1, #-16] // ...............e............'...................................................~............'.................................................. - // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~............................................... - // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~.............................................. 
- // ldr q12, [x2], #32 // ................e...........'....................................................~...........'.................................................. - // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~............. - // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~................................................. - // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................ - // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'.................................................. - // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~........................................... - // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~.......................................... - // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~....................................... - // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~...................................... - // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~............................................. - // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................ 
- // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~......................................... - // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................ - // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'.................................................. - // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'.................................................. - // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~..................................... - // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~.................................... - // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'.................................................. - // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'.................................................. - // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~................................... - // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~.................................. - // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'.................................................. 
- // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~................................. - // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................ - // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~............................. - // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................ - // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~............................... - // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~.............................. - // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~........................... - // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~.......................... - // ldr q12, [x7], #32 // .......................e....'...........................................................~....'.................................................. - // ldr q13, [x7, #-16] // ........................e...'............................................................~...'.................................................. - // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~......................... 
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................ - // ldr q12, [x8], #32 // .........................e..'.............................................................~..'.................................................. - // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'.................................................. - // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~....................... - // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~...................... - // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'.................................................. - // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~..................... - // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~.................... - // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~................. - // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................ - // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~................... 
- // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~.................. - // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~............... - // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~.............. - // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............ - // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~.......... - // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~...... - // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~..... - // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~. - // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~........... - // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~......... - // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~.... 
- // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~... - // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'.................................................. - // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l....... - // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........ - // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l.. - // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l - - sub count, count, #1 - cbnz count, 1b - // Instructions: 55 - // Expected cycles: 61 - // Expected IPC: 0.90 - - // Cycle bound: 61.0 - // IPC bound: 0.90 - - // Wall time: 8.41s - // User time: 8.41s - - // ----------------- original position ------------------> - // 0 25 50 - // |------------------------|------------------------|---- - ldr q7, [x1], #32 // *...................................................... - uzp1 v20.8H, v15.8H, v16.8H // .*..................................................... - uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... - uzp1 v23.8H, v7.8H, v30.8H // ...*................................................... - uzp2 v11.8H, v7.8H, v30.8H // ....*.................................................. - smull2 v8.4S, v23.8H, v20.8H // ........*.............................................. 
- smull v5.4S, v23.4H, v20.4H // .......*............................................... - smull2 v30.4S, v23.8H, v15.8H // ......*................................................ - uzp1 v28.8H, v1.8H, v12.8H // ...............*....................................... - smlal2 v8.4S, v11.8H, v9.8H // ............*.......................................... - smlal v5.4S, v11.4H, v9.4H // ...........*........................................... - uzp1 v3.8H, v21.8H, v6.8H // .............*......................................... - smull v16.4S, v23.4H, v15.4H // .....*................................................. - smlal2 v8.4S, v3.8H, v28.8H // ..................*.................................... - smlal v5.4S, v3.4H, v28.4H // .................*..................................... - uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................ - uzp1 v7.8H, v17.8H, v18.8H // ...........................*........................... - smlal2 v8.4S, v29.8H, v24.8H // ......................*................................ - uzp1 v14.8H, v19.8H, v31.8H // .........................*............................. - smlal v16.4S, v11.4H, v20.4H // .........*............................................. - smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................ - smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................ - uzp2 v20.8H, v1.8H, v12.8H // ................*...................................... - uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................ - smlal2 v30.4S, v3.8H, v20.8H // ....................*.................................. - smlal v16.4S, v3.4H, v20.4H // ...................*................................... - smlal v5.4S, v29.4H, v24.4H // .....................*................................. - uzp2 v9.8H, v17.8H, v18.8H // ............................*.......................... 
- smlal2 v30.4S, v29.8H, v28.8H // ........................*.............................. - smlal v16.4S, v29.4H, v28.4H // .......................*............................... - smlal v5.4S, v14.4H, v7.4H // .............................*......................... - smlal2 v8.4S, v21.8H, v25.8H // ..................................*.................... - smlal2 v30.4S, v14.8H, v9.8H // ................................*...................... - smlal v16.4S, v14.4H, v9.4H // ...............................*....................... - smlal v5.4S, v21.4H, v25.4H // .................................*..................... - zip1 v20.8H, v27.8H, v10.8H // ..........................................*............ - smlal2 v30.4S, v21.8H, v7.8H // ....................................*.................. - smlal v16.4S, v21.4H, v7.4H // ...................................*................... - uzp1 v7.8H, v5.8H, v8.8H // .....................................*................. - str q20, [x0], #32 // ...............................................*....... - mul v15.8H, v7.8H, v2.8H // .......................................*............... - uzp1 v7.8H, v16.8H, v30.8H // ......................................*................ - zip2 v31.8H, v27.8H, v10.8H // .........................................*............. - mul v20.8H, v7.8H, v2.8H // ........................................*.............. - smlal v5.4S, v15.4H, v0.4H // ...........................................*........... - smlal2 v8.4S, v15.8H, v0.8H // ............................................*.......... - str q31, [x0, #-16] // .................................................*..... - smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........ - smlal v16.4S, v20.4H, v0.4H // .............................................*......... - uzp2 v15.8H, v5.8H, v8.8H // ................................................*...... 
- uzp2 v20.8H, v16.8H, v30.8H // ..................................................*.... - zip1 v7.8H, v15.8H, v20.8H // ....................................................*.. - zip2 v20.8H, v15.8H, v20.8H // ...................................................*... - str q7, [x0], #32 // .....................................................*. - str q20, [x0, #-16] // ......................................................* - - // -------------------- new position --------------------> - // 0 25 50 - // |------------------------|------------------------|---- - // ldr q20, [x1], #32 // *...................................................... - // uzp1 v7.8H, v15.8H, v16.8H // .*..................................................... - // uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... - // uzp1 v8.8H, v20.8H, v30.8H // ...*................................................... - // uzp2 v20.8H, v20.8H, v30.8H // ....*.................................................. - // smull v30.4S, v8.4H, v15.4H // ............*.......................................... - // smull2 v15.4S, v8.8H, v15.8H // .......*............................................... - // smull v11.4S, v8.4H, v7.4H // ......*................................................ - // smull2 v8.4S, v8.8H, v7.8H // .....*................................................. - // smlal v30.4S, v20.4H, v7.4H // ...................*................................... - // smlal2 v15.4S, v20.8H, v7.8H // ....................*.................................. - // smlal v11.4S, v20.4H, v9.4H // ..........*............................................ - // smlal2 v8.4S, v20.8H, v9.8H // .........*............................................. - // uzp1 v7.8H, v21.8H, v6.8H // ...........*........................................... - // uzp2 v20.8H, v21.8H, v6.8H // ...............*....................................... 
- // uzp1 v16.8H, v1.8H, v12.8H // ........*.............................................. - // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................ - // smlal v11.4S, v7.4H, v16.4H // ..............*........................................ - // smlal2 v8.4S, v7.8H, v16.8H // .............*......................................... - // smlal v30.4S, v7.4H, v9.4H // .........................*............................. - // smlal2 v15.4S, v7.8H, v9.8H // ........................*.............................. - // smlal v11.4S, v20.4H, v24.4H // ..........................*............................ - // smlal2 v8.4S, v20.8H, v24.8H // .................*..................................... - // smlal v30.4S, v20.4H, v16.4H // .............................*......................... - // smlal2 v15.4S, v20.8H, v16.8H // ............................*.......................... - // uzp1 v7.8H, v19.8H, v31.8H // ..................*.................................... - // uzp2 v20.8H, v19.8H, v31.8H // .......................*............................... - // uzp1 v16.8H, v17.8H, v18.8H // ................*...................................... - // uzp2 v9.8H, v17.8H, v18.8H // ...........................*........................... - // smlal v11.4S, v7.4H, v16.4H // ..............................*........................ - // smlal2 v8.4S, v7.8H, v16.8H // .....................*................................. - // smlal v30.4S, v7.4H, v9.4H // .................................*..................... - // smlal2 v15.4S, v7.8H, v9.8H // ................................*...................... - // smlal v11.4S, v20.4H, v25.4H // ..................................*.................... - // smlal2 v8.4S, v20.8H, v25.8H // ...............................*....................... - // smlal v30.4S, v20.4H, v16.4H // .....................................*................. 
- // smlal2 v15.4S, v20.8H, v16.8H // ....................................*.................. - // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................ - // uzp1 v20.8H, v30.8H, v15.8H // .........................................*............. - // mul v7.8H, v7.8H, v2.8H // ........................................*.............. - // mul v20.8H, v20.8H, v2.8H // ...........................................*........... - // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............ - // zip1 v27.8H, v27.8H, v10.8H // ...................................*................... - // smlal v11.4S, v7.4H, v0.4H // ............................................*.......... - // smlal2 v8.4S, v7.8H, v0.8H // .............................................*......... - // smlal v30.4S, v20.4H, v0.4H // ................................................*...... - // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*....... - // str q27, [x0], #32 // .......................................*............... - // uzp2 v27.8H, v11.8H, v8.8H // .................................................*..... - // str q9, [x0, #-16] // ..............................................*........ - // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*.... - // zip2 v9.8H, v27.8H, v10.8H // ....................................................*.. - // zip1 v27.8H, v27.8H, v10.8H // ...................................................*... - // str q27, [x0], #32 // .....................................................*. 
- // str q9, [x0, #-16] // ......................................................* - - - pop_stack - ret -#endif /* MLKEM_K == 3 */ - -#if MLKEM_K == 4 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - add a3_ptr, a0_ptr, #(3 * 512) - add b3_ptr, b0_ptr, #(3 * 512) - add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) - - // Bounds: - - // Each pmull is bound by 2*4096*2^15=2^28, so the final value - // before Montgomery reduction is bound by 2^30. - - mov count, #(MLKEM_N / 16) - // Instructions: 114 - // Expected cycles: 153 - // Expected IPC: 0.75 - // - // Cycle bound: 153.0 - // IPC bound: 0.75 - // - // Wall time: 0.69s - // User time: 0.69s - // - // ----------------------------------------------- original position -----------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------------- - ldr q23, [x2, #16] // .*................................................................................................................ - ldr q19, [x2], #32 // *................................................................................................................. - ldr q17, [x5], #32 // ..*............................................................................................................... - uzp2 v13.8H, v19.8H, v23.8H // ..........*....................................................................................................... 
- uzp1 v19.8H, v19.8H, v23.8H // ...........*...................................................................................................... - ldr q23, [x5, #-16] // ...*.............................................................................................................. - ldr q30, [x1, #16] // .....*............................................................................................................ - uzp2 v9.8H, v17.8H, v23.8H // ....*............................................................................................................. - uzp1 v23.8H, v17.8H, v23.8H // .......*.......................................................................................................... - ldr q17, [x1], #32 // ......*........................................................................................................... - ldr q10, [x7, #16] // .............*.................................................................................................... - uzp1 v12.8H, v17.8H, v30.8H // ........*......................................................................................................... - uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................ - smull2 v30.4S, v12.8H, v13.8H // ............*..................................................................................................... - smull v13.4S, v12.4H, v13.4H // ............................................*..................................................................... - smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................ - smull v12.4S, v12.4H, v19.4H // ..........................................*....................................................................... 
- smlal2 v30.4S, v17.8H, v19.8H // ...............................*.................................................................................. - smlal v13.4S, v17.4H, v19.4H // ...............................................*.................................................................. - ldr q19, [x4], #32 // ....................*............................................................................................. - ldr q16, [x4, #-16] // .....................*............................................................................................ - ld1 {v8.8H}, [x3], #16 // ................................*................................................................................. - uzp1 v26.8H, v19.8H, v16.8H // .......................*.......................................................................................... - uzp2 v19.8H, v19.8H, v16.8H // ........................*......................................................................................... - smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................ - smlal v13.4S, v26.4H, v9.4H // ..................................................*............................................................... - smlal2 v22.4S, v17.8H, v8.8H // ........................................*......................................................................... - smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................ - smlal2 v30.4S, v19.8H, v23.8H // ...................................*.............................................................................. - smlal v13.4S, v19.4H, v23.4H // .......................................................*.......................................................... 
- smlal2 v22.4S, v26.8H, v23.8H // ...........................................*...................................................................... - smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................ - ldr q23, [x7], #32 // ......................*........................................................................................... - ldr q17, [x8, #16] // ..............*................................................................................................... - uzp1 v9.8H, v23.8H, v10.8H // ..........................*....................................................................................... - uzp2 v23.8H, v23.8H, v10.8H // ....................................*............................................................................. - ldr q10, [x10], #32 // ...............*.................................................................................................. - ldr q16, [x10, #-16] // ................*................................................................................................. - ld1 {v8.8H}, [x12], #16 // .................*................................................................................................ - uzp1 v26.8H, v10.8H, v16.8H // ..................*............................................................................................... - uzp2 v10.8H, v10.8H, v16.8H // ...................*.............................................................................................. - ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................ - ldr q3, [x11, #16] // ...........................*...................................................................................... 
- smlal2 v22.4S, v19.8H, v16.8H // ..............................................*................................................................... - smlal v12.4S, v19.4H, v16.4H // ........................................................*......................................................... - ldr q19, [x11], #32 // ............................*..................................................................................... - ld1 {v16.8H}, [x9], #16 // .............................*.................................................................................... - uzp1 v4.8H, v19.8H, v3.8H // ..................................*............................................................................... - uzp2 v19.8H, v19.8H, v3.8H // .......................................*.......................................................................... - ldr q3, [x8], #32 // ..............................*................................................................................... - ldr q31, [x2], #32 // ......................................*........................................................................... - uzp1 v6.8H, v3.8H, v17.8H // ...................................................*.............................................................. - uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................ - smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*....................................................... - smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*...................................................... - smlal v13.4S, v9.4H, v17.4H // ............................................................*..................................................... 
- smlal v12.4S, v9.4H, v6.4H // .............................................................*.................................................... - smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*................................................... - smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*.................................................. - smlal v13.4S, v23.4H, v6.4H // ................................................................*................................................. - smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................ - smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*............................................... - smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*.............................................. - smlal v13.4S, v26.4H, v19.4H // ....................................................................*............................................. - smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................ - smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*........................................... - smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*.......................................... - smlal v13.4S, v10.4H, v4.4H // ........................................................................*......................................... - smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................ 
- ldr q19, [x2, #-16] // .........................................*........................................................................ - uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*...................................... - uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*............................. - mul v23.8H, v23.8H, v2.8H // .............................................................................*.................................... - uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*................................. - uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*.............................. - mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................ - smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................ - smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*............................... - ldr q23, [x5], #32 // .............................................*.................................................................... - smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*...... - uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*........................... - smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*..... 
- ldr q17, [x5, #-16] // ................................................*................................................................. - ldr q13, [x1, #16] // ......................................................*........................................................... - uzp2 v27.8H, v23.8H, v17.8H // ....................................................*............................................................. - uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*..................................... - uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*.. - ldr q23, [x1], #32 // ..........................................................................*....................................... - zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................* - ldr q3, [x7, #16] // ........................................................................................*......................... - uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*................................... - uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*.................................. - smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*.......................... - ldr q6, [x8, #16] // .........................................................................................*........................ - ldr q23, [x10], #32 // ..........................................................................................*....................... 
- smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*....... - ldr q17, [x10, #-16] // ...........................................................................................*...................... - ld1 {v22.8H}, [x12], #16 // ............................................................................................*..................... - uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*.................... - uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*................... - ldr q23, [x4], #32 // ...............................................................................................*.................. - ldr q17, [x4, #-16] // ................................................................................................*................. - ldr q4, [x7], #32 // .................................................................................................*................ - uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*............... - uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*.............. - uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............ - smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*... - ld1 {v8.8H}, [x6], #16 // ....................................................................................................*............. 
- ldr q25, [x11, #16] // ......................................................................................................*........... - ldr q29, [x11], #32 // .......................................................................................................*.......... - ld1 {v12.8H}, [x9], #16 // ........................................................................................................*......... - uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*. - ldr q14, [x8], #32 // .........................................................................................................*........ - ld1 {v23.8H}, [x3], #16 // .............................................................................................................*.... - - // ------------------------------------------------- new position --------------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------------- - // ldr q3, [x2], #32 // .*................................................................................................................ - // ldr q17, [x2, #-16] // *................................................................................................................. - // ldr q21, [x5], #32 // ..*............................................................................................................... - // ldr q19, [x5, #-16] // .....*............................................................................................................ - // uzp2 v27.8H, v21.8H, v19.8H // .......*.......................................................................................................... - // ldr q25, [x1, #16] // ......*........................................................................................................... 
- // ldr q22, [x1], #32 // .........*........................................................................................................ - // uzp1 v28.8H, v21.8H, v19.8H // ........*......................................................................................................... - // uzp1 v31.8H, v22.8H, v25.8H // ...........*...................................................................................................... - // uzp2 v16.8H, v22.8H, v25.8H // ............*..................................................................................................... - // uzp2 v21.8H, v3.8H, v17.8H // ...*.............................................................................................................. - // uzp1 v19.8H, v3.8H, v17.8H // ....*............................................................................................................. - // smull2 v24.4S, v31.8H, v21.8H // .............*.................................................................................................... - // ldr q3, [x7, #16] // ..........*....................................................................................................... - // ldr q6, [x8, #16] // .................................*................................................................................ - // ldr q8, [x10], #32 // ....................................*............................................................................. - // ldr q26, [x10, #-16] // .....................................*............................................................................ - // ld1 {v22.8H}, [x12], #16 // ......................................*........................................................................... - // uzp1 v30.8H, v8.8H, v26.8H // .......................................*.......................................................................... 
- // uzp2 v11.8H, v8.8H, v26.8H // ........................................*......................................................................... - // ldr q8, [x4], #32 // ...................*.............................................................................................. - // ldr q26, [x4, #-16] // ....................*............................................................................................. - // ldr q4, [x7], #32 // ................................*................................................................................. - // uzp1 v20.8H, v8.8H, v26.8H // ......................*........................................................................................... - // uzp2 v26.8H, v8.8H, v26.8H // .......................*.......................................................................................... - // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................ - // uzp1 v9.8H, v4.8H, v3.8H // ..................................*............................................................................... - // ldr q25, [x11, #16] // ..........................................*....................................................................... - // ldr q29, [x11], #32 // .............................................*.................................................................... - // ld1 {v12.8H}, [x9], #16 // ..............................................*................................................................... - // ldr q14, [x8], #32 // .................................................*................................................................ - // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................ 
- // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................ - // smlal2 v24.4S, v20.8H, v27.8H // ........................*......................................................................................... - // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*.................................................................. - // smlal2 v24.4S, v26.8H, v28.8H // ............................*..................................................................................... - // uzp2 v4.8H, v4.8H, v3.8H // ...................................*.............................................................................. - // smull2 v13.4S, v31.8H, v19.8H // ...............*.................................................................................................. - // ldr q3, [x2], #32 // ..................................................*............................................................... - // uzp2 v1.8H, v29.8H, v25.8H // ................................................*................................................................. - // smlal2 v13.4S, v16.8H, v23.8H // ..........................*....................................................................................... - // ldr q17, [x2, #-16] // .....................................................................*............................................ - // smull v18.4S, v31.4H, v19.4H // ................*................................................................................................. - // smlal2 v13.4S, v20.8H, v28.8H // ..............................*................................................................................... - // smull v29.4S, v31.4H, v21.4H // ..............*................................................................................................... 
- // ldr q21, [x5], #32 // ..............................................................................*................................... - // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*...................................................................... - // smlal v29.4S, v16.4H, v19.4H // ..................*............................................................................................... - // ldr q19, [x5, #-16] // ..................................................................................*............................... - // smlal v18.4S, v16.4H, v23.4H // ...........................*...................................................................................... - // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................ - // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*.............................................................. - // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*............................. - // smlal v18.4S, v20.4H, v28.4H // ...............................*.................................................................................. - // ldr q25, [x1, #16] // ...................................................................................*.............................. - // smlal v29.4S, v26.4H, v28.4H // .............................*.................................................................................... - // smlal v18.4S, v26.4H, v8.4H // ............................................*..................................................................... - // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*............................................................. 
- // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................ - // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*........................................................... - // smlal v29.4S, v9.4H, v26.4H // .......................................................*.......................................................... - // smlal v18.4S, v9.4H, v31.4H // ........................................................*......................................................... - // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................ - // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*....................................................... - // smlal v29.4S, v4.4H, v31.4H // ...........................................................*...................................................... - // smlal v18.4S, v4.4H, v12.4H // ............................................................*..................................................... - // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................................................... - // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*................................................... - // smlal v29.4S, v30.4H, v1.4H // ...............................................................*.................................................. - // smlal v18.4S, v30.4H, v10.4H // ................................................................*................................................. - // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................ 
- // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*............................................... - // smlal v29.4S, v11.4H, v10.4H // ...................................................................*.............................................. - // smlal v18.4S, v11.4H, v22.4H // ....................................................................*............................................. - // ldr q22, [x1], #32 // .......................................................................................*.......................... - // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*........................................... - // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................ - // mul v19.8H, v31.8H, v2.8H // ........................................................................*......................................... - // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*....................... - // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*...................... - // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................ - // smlal v29.4S, v19.4H, v0.4H // ............................................................................*..................................... - // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*.................................... - // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*....................................... 
- // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*.......................................... - // mul v23.8H, v26.8H, v2.8H // ...........................................................................*...................................... - // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*................................. - // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*..................... - // ldr q3, [x7, #16] // .........................................................................................*........................ - // ldr q6, [x8, #16] // .............................................................................................*.................... - // ldr q8, [x10], #32 // ..............................................................................................*................... - // ldr q26, [x10, #-16] // ................................................................................................*................. - // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................ - // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*............... - // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*.............. - // ldr q8, [x4], #32 // ....................................................................................................*............. - // ldr q26, [x4, #-16] // .....................................................................................................*............ 
- // ldr q4, [x7], #32 // ......................................................................................................*........... - // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*.......... - // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*......... - // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*...... - // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........ - // ldr q25, [x11, #16] // ............................................................................................................*..... - // ldr q29, [x11], #32 // .............................................................................................................*.... - // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*... - // ldr q14, [x8], #32 // ................................................................................................................*. - // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*.................. - // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*.................................. - // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................ 
- // ld1 {v23.8H}, [x3], #16 // .................................................................................................................* - // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*....... - // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*........................... - // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*.. - // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*......................... - - sub count, count, #2 -1: - // Instructions: 82 - // Expected cycles: 102 - // Expected IPC: 0.80 - // - // Cycle bound: 102.0 - // IPC bound: 0.80 - // - // Wall time: 15.93s - // User time: 15.93s - // - // ------------------------------- original position -------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|------ - smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................ - uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................ - smull2 v13.4S, v31.8H, v19.8H // ..........*....................................................................... - ldr q3, [x2], #32 // ....e............................................................................. - uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*....................... - smlal2 v13.4S, v16.8H, v23.8H // ............*..................................................................... - ldr q17, [x2, #-16] // .....e............................................................................ 
- smull v18.4S, v31.4H, v19.4H // .........*........................................................................ - smlal2 v13.4S, v20.8H, v28.8H // ...........................*...................................................... - smull v29.4S, v31.4H, v21.4H // .............*.................................................................... - ldr q21, [x5], #32 // .....................e............................................................ - smlal2 v13.4S, v26.8H, v8.8H // .............................*.................................................... - smlal v29.4S, v16.4H, v19.4H // ...............*.................................................................. - ldr q19, [x5, #-16] // ......................e........................................................... - smlal v18.4S, v16.4H, v23.4H // ...........*...................................................................... - smlal v29.4S, v20.4H, v27.4H // ..............................*................................................... - uzp1 v31.8H, v14.8H, v6.8H // ........................................*......................................... - uzp2 v27.8H, v21.8H, v19.8H // ........................e......................................................... - smlal v18.4S, v20.4H, v28.4H // ..........................*....................................................... - ldr q25, [x1, #16] // .e................................................................................ - smlal v29.4S, v26.4H, v28.4H // ................................*................................................. - smlal v18.4S, v26.4H, v8.4H // ............................*..................................................... - uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................ - smlal2 v13.4S, v9.8H, v31.8H // ............................................*..................................... 
- smlal2 v24.4S, v9.8H, v26.8H // ................................................*................................. - smlal v29.4S, v9.4H, v26.4H // ...............................................*.................................. - smlal v18.4S, v9.4H, v31.4H // ...........................................*...................................... - smlal2 v13.4S, v4.8H, v12.8H // ..............................................*................................... - smlal2 v24.4S, v4.8H, v31.8H // ..................................................*............................... - smlal v29.4S, v4.4H, v31.4H // .................................................*................................ - smlal v18.4S, v4.4H, v12.4H // .............................................*.................................... - smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................... - smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................ - smlal v29.4S, v30.4H, v1.4H // ................................................................*................. - smlal v18.4S, v30.4H, v10.4H // ............................................................*..................... - smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*.................. - smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*.............. - smlal v29.4S, v11.4H, v10.4H // ..................................................................*............... - smlal v18.4S, v11.4H, v22.4H // ..............................................................*................... - ldr q22, [x1], #32 // e................................................................................. - uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........ 
- uzp1 v28.8H, v21.8H, v19.8H // .......................e.......................................................... - mul v19.8H, v31.8H, v2.8H // ..........................................................................*....... - uzp1 v31.8H, v22.8H, v25.8H // ..e............................................................................... - uzp2 v16.8H, v22.8H, v25.8H // ...e.............................................................................. - uzp2 v21.8H, v3.8H, v17.8H // .......e.......................................................................... - smlal v29.4S, v19.4H, v0.4H // ...........................................................................*...... - smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*..... - uzp1 v19.8H, v3.8H, v17.8H // ......e........................................................................... - uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*............. - zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l.. - mul v23.8H, v26.8H, v2.8H // .....................................................................*............ - uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*.... - smull2 v24.4S, v31.8H, v21.8H // ..............e................................................................... - str q14, [x0, #16] // .................................................................................l - ldr q3, [x7, #16] // ...................................e.............................................. - ldr q6, [x8, #16] // .......................................e.......................................... - ldr q8, [x10], #32 // ...................................................e.............................. 
- ldr q26, [x10, #-16] // ....................................................e............................. - ld1 {v22.8H}, [x12], #16 // ...........................................................e...................... - uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................ - uzp2 v11.8H, v8.8H, v26.8H // ......................................................e........................... - ldr q8, [x4], #32 // .................e................................................................ - ldr q26, [x4, #-16] // ..................e............................................................... - ldr q4, [x7], #32 // ..................................e............................................... - uzp1 v20.8H, v8.8H, v26.8H // ...................e.............................................................. - uzp2 v26.8H, v8.8H, v26.8H // ....................e............................................................. - ld1 {v8.8H}, [x6], #16 // .........................e........................................................ - uzp1 v9.8H, v4.8H, v3.8H // ....................................e............................................. - ldr q25, [x11, #16] // ........................................................e......................... - ldr q29, [x11], #32 // .......................................................e.......................... - ld1 {v12.8H}, [x9], #16 // ..........................................e....................................... - ldr q14, [x8], #32 // ......................................e........................................... - smlal2 v24.4S, v16.8H, v19.8H // ................e................................................................. - smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*.......... 
- smlal v18.4S, v23.4H, v0.4H // ......................................................................*........... - ld1 {v23.8H}, [x3], #16 // ........e......................................................................... - smlal2 v24.4S, v20.8H, v27.8H // ...............................e.................................................. - uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*......... - uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................ - str q5, [x0], #32 // ................................................................................l. - zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*... - - // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 175 200 225 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- - // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~......................................... - // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~............................................................. 
- // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~..................................... - // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~.................................... - // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~............................................................................. - // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~.......................................................................... - // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................ - // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~................................... - // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~.... 
- // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~......................................................................... - // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~.............................................................................. - // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~.................................................................. - // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~........................................................................... - // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~....................................................................... - // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~........................... - // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~.................................................................... 
- // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~....... - // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~.................. - // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~................. - // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~............... - // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~.............. - // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~...................................................................... - // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~................................................................... 
- // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~....................................... - // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~............................................................... - // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~............. - // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~.............................................................. - // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................ - // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~........................................................... - // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~..................................................................... 
- // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~................................................................. - // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~... - // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................ - // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................ - // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................ - // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~......................... - // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............ 
- // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~............................................................................... - // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........ - // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................ - // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................ - // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~.......................................................... - // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~......... - // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~...................................................... 
- // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~......................................................... - // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~.................................................. - // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~..................................................... - // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~....................................................... - // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................ - // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~................................................... - // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~.................................................... 
- // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~....................... - // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~...................... - // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~.................... - // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~................... - // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~.......... - // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~........... - // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~. 
- // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................ - // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~..................... - // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~.............................................. - // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~................................................. - // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~.......................................... - // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~............................................. - // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~............................................... 
- // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................ - // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~........................................... - // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................ - // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~............................... - // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~............................. - // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~..... - // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~...... 
- // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~.. - // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................ - // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~...................................... - // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~.................................. - // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~................................. - // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................ - // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................ 
- // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l.............................. - // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l - // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l.......................... - - sub count, count, #1 - cbnz count, 1b - // Instructions: 50 - // Expected cycles: 56 - // Expected IPC: 0.89 - // - // Cycle bound: 56.0 - // IPC bound: 0.89 - // - // Wall time: 4.16s - // User time: 4.16s - // - // --------------- original position ---------------> - // 0 25 - // |------------------------| - smull2 v17.4S, v31.8H, v19.8H // ..*............................................... - uzp2 v1.8H, v14.8H, v6.8H // ................*................................. - smull v18.4S, v31.4H, v21.4H // .......*.......................................... - smlal2 v24.4S, v26.8H, v28.8H // *................................................. - smlal2 v17.4S, v16.8H, v23.8H // ....*............................................. - smull v21.4S, v31.4H, v19.4H // .....*............................................ - smlal v18.4S, v16.4H, v19.4H // .........*........................................ - uzp2 v31.8H, v4.8H, v3.8H // .*................................................ - uzp1 v3.8H, v14.8H, v6.8H // ............*..................................... - smlal v21.4S, v16.4H, v23.4H // ..........*....................................... 
- smlal v18.4S, v20.4H, v27.4H // ...........*...................................... - uzp2 v14.8H, v29.8H, v25.8H // ...*.............................................. - smlal2 v17.4S, v20.8H, v28.8H // ......*........................................... - smlal v21.4S, v20.4H, v28.4H // .............*.................................... - smlal v18.4S, v26.4H, v28.4H // ..............*................................... - smlal2 v24.4S, v9.8H, v1.8H // ..................*............................... - smlal2 v17.4S, v26.8H, v8.8H // ........*......................................... - smlal v21.4S, v26.4H, v8.4H // ...............*.................................. - smlal v18.4S, v9.4H, v1.4H // ...................*.............................. - smlal2 v24.4S, v31.8H, v3.8H // ......................*........................... - smlal2 v17.4S, v9.8H, v3.8H // .................*................................ - smlal v21.4S, v9.4H, v3.4H // ....................*............................. - smlal v18.4S, v31.4H, v3.4H // .......................*.......................... - smlal2 v24.4S, v30.8H, v14.8H // ..........................*....................... - smlal2 v17.4S, v31.8H, v12.8H // .....................*............................ - smlal v21.4S, v31.4H, v12.4H // ........................*......................... - smlal v18.4S, v30.4H, v14.4H // ...........................*...................... - smlal2 v24.4S, v11.8H, v10.8H // ..............................*................... - smlal2 v17.4S, v30.8H, v10.8H // .........................*........................ - smlal v21.4S, v30.4H, v10.4H // ............................*..................... - smlal v18.4S, v11.4H, v10.4H // ...............................*.................. - zip2 v19.8H, v7.8H, v15.8H // ......................................*........... - smlal2 v17.4S, v11.8H, v22.8H // .............................*.................... 
- smlal v21.4S, v11.4H, v22.4H // ................................*................. - uzp1 v23.8H, v18.8H, v24.8H // .................................*................ - str q19, [x0, #16] // .........................................*........ - mul v19.8H, v23.8H, v2.8H // ..................................*............... - uzp1 v23.8H, v21.8H, v17.8H // .....................................*............ - str q5, [x0], #32 // .............................................*.... - mul v26.8H, v23.8H, v2.8H // .......................................*.......... - smlal v18.4S, v19.4H, v0.4H // ...................................*.............. - smlal2 v24.4S, v19.8H, v0.8H // ....................................*............. - smlal v21.4S, v26.4H, v0.4H // ...........................................*...... - smlal2 v17.4S, v26.8H, v0.8H // ..........................................*....... - uzp2 v13.8H, v18.8H, v24.8H // ........................................*......... - uzp2 v19.8H, v21.8H, v17.8H // ............................................*..... - zip1 v23.8H, v19.8H, v13.8H // ..............................................*... - zip2 v19.8H, v19.8H, v13.8H // ...............................................*.. - str q23, [x0], #32 // .................................................* - str q19, [x0, #-16] // ................................................*. - - // ----------------- new position ------------------> - // 0 25 - // |------------------------|------------------------ - // smlal2 v24.4S, v26.8H, v28.8H // ...*.............................................. - // uzp2 v4.8H, v4.8H, v3.8H // .......*.......................................... - // smull2 v13.4S, v31.8H, v19.8H // *................................................. - // uzp2 v1.8H, v29.8H, v25.8H // ...........*...................................... - // smlal2 v13.4S, v16.8H, v23.8H // ....*............................................. 
- // smull v18.4S, v31.4H, v19.4H // .....*............................................ - // smlal2 v13.4S, v20.8H, v28.8H // ............*..................................... - // smull v29.4S, v31.4H, v21.4H // ..*............................................... - // smlal2 v13.4S, v26.8H, v8.8H // ................*................................. - // smlal v29.4S, v16.4H, v19.4H // ......*........................................... - // smlal v18.4S, v16.4H, v23.4H // .........*........................................ - // smlal v29.4S, v20.4H, v27.4H // ..........*....................................... - // uzp1 v31.8H, v14.8H, v6.8H // ........*......................................... - // smlal v18.4S, v20.4H, v28.4H // .............*.................................... - // smlal v29.4S, v26.4H, v28.4H // ..............*................................... - // smlal v18.4S, v26.4H, v8.4H // .................*................................ - // uzp2 v26.8H, v14.8H, v6.8H // .*................................................ - // smlal2 v13.4S, v9.8H, v31.8H // ....................*............................. - // smlal2 v24.4S, v9.8H, v26.8H // ...............*.................................. - // smlal v29.4S, v9.4H, v26.4H // ..................*............................... - // smlal v18.4S, v9.4H, v31.4H // .....................*............................ - // smlal2 v13.4S, v4.8H, v12.8H // ........................*......................... - // smlal2 v24.4S, v4.8H, v31.8H // ...................*.............................. - // smlal v29.4S, v4.4H, v31.4H // ......................*........................... - // smlal v18.4S, v4.4H, v12.4H // .........................*........................ - // smlal2 v13.4S, v30.8H, v10.8H // ............................*..................... - // smlal2 v24.4S, v30.8H, v1.8H // .......................*.......................... 
- // smlal v29.4S, v30.4H, v1.4H // ..........................*....................... - // smlal v18.4S, v30.4H, v10.4H // .............................*.................... - // smlal2 v13.4S, v11.8H, v22.8H // ................................*................. - // smlal2 v24.4S, v11.8H, v10.8H // ...........................*...................... - // smlal v29.4S, v11.4H, v10.4H // ..............................*................... - // smlal v18.4S, v11.4H, v22.4H // .................................*................ - // uzp1 v31.8H, v29.8H, v24.8H // ..................................*............... - // mul v19.8H, v31.8H, v2.8H // ....................................*............. - // smlal v29.4S, v19.4H, v0.4H // ........................................*......... - // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........ - // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............ - // zip2 v14.8H, v7.8H, v15.8H // ...............................*.................. - // mul v23.8H, v26.8H, v2.8H // .......................................*.......... - // uzp2 v15.8H, v29.8H, v24.8H // ............................................*..... - // str q14, [x0, #16] // ...................................*.............. - // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*...... - // smlal v18.4S, v23.4H, v0.4H // ..........................................*....... - // uzp2 v7.8H, v18.8H, v13.8H // .............................................*.... - // str q5, [x0], #32 // ......................................*........... - // zip1 v5.8H, v7.8H, v15.8H // ..............................................*... - // zip2 v14.8H, v7.8H, v15.8H // ...............................................*.. - // str q14, [x0, #16] // .................................................* - // str q5, [x0], #32 // ................................................*. 
- - - pop_stack - ret -#endif /* MLKEM_K == 4 */ - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq out - .unreq a0_ptr - .unreq b0_ptr - .unreq b0_cache_ptr - .unreq a1_ptr - .unreq b1_ptr - .unreq b1_cache_ptr - .unreq a2_ptr - .unreq b2_ptr - .unreq b2_cache_ptr - .unreq a3_ptr - .unreq b3_ptr - .unreq b3_cache_ptr - .unreq count - .unreq modulus - .unreq modulus_twisted - .unreq wtmp - .unreq aa0 - .unreq aa1 - .unreq bb0 - .unreq bb1 - .unreq bb1t - .unreq res0l - .unreq res1l - .unreq res0h - .unreq res1h - .unreq tmp0 - .unreq tmp1 - .unreq q_tmp0 - .unreq q_tmp1 - .unreq out0 - .unreq out1 - .unreq t0 - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S index 8302d2a3e..f2451815a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_asm_clean.S @@ -19,8 +19,8 @@ * Returns number of sampled 16-bit integers (at most MLKEM_N). **************************************************/ #include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ // We save the output on the stack first, and copy to the actual // output buffer only in the end. 
This is because the main loop can overwrite @@ -112,9 +112,9 @@ mlkem_q .req v30 bits .req v31 -.text -.global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean) -.balign 4 + .text + .global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean) + .balign 4 MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean): push_stack @@ -402,5 +402,5 @@ return: .unreq mlkem_q .unreq bits -#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */ +/* simpasm: footer-start */ +#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c index becdf303b..592c15fb0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/aarch64/src/rej_uniform_table.c @@ -10,8 +10,7 @@ #include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) #include #include "arith_native_aarch64.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h index f9fe4310a..df43dc5b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_aarch64/native/default.h @@ -11,21 +11,10 @@ #include "../sys.h" #ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. - */ #include "aarch64/opt.h" #endif /* SYS_AARCH64 */ #ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. 
- * https://github.com/pq-crystals/kyber - */ #include "x86_64/default.h" #endif /* SYS_X86_64 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h index f9fe4310a..df43dc5b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_ref/native/default.h @@ -11,21 +11,10 @@ #include "../sys.h" #ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. - */ #include "aarch64/opt.h" #endif /* SYS_AARCH64 */ #ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. - * https://github.com/pq-crystals/kyber - */ #include "x86_64/default.h" #endif /* SYS_X86_64 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h index f9fe4310a..df43dc5b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/default.h @@ -11,21 +11,10 @@ #include "../sys.h" #ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. - */ #include "aarch64/opt.h" #endif /* SYS_AARCH64 */ #ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. 
- * https://github.com/pq-crystals/kyber - */ #include "x86_64/default.h" #endif /* SYS_X86_64 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S index 5fdc3d0a0..3063d20ae 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/basemul.S @@ -8,6 +8,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ #include "consts.h" @@ -113,6 +114,7 @@ vmovdqa %ymm11,(64*\off+48)*2(%rdi) .text .global MLKEM_ASM_NAMESPACE(basemul_avx2) +.balign 4 MLKEM_ASM_NAMESPACE(basemul_avx2): mov %rsp,%r8 and $-32,%rsp @@ -133,4 +135,5 @@ schoolbook 3 mov %r8,%rsp ret +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S index 7b1f22624..e74199930 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/intt.S @@ -12,6 +12,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ #include "consts.h" #include "shuffle.inc" @@ -242,6 +243,7 @@ vmovdqa %ymm11,(64*\off+176)*2(%rdi) .text .global MLKEM_ASM_NAMESPACE(invntt_avx2) +.balign 4 MLKEM_ASM_NAMESPACE(invntt_avx2): vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 @@ -252,4 +254,5 @@ intt_level6 0 intt_level6 1 ret +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S index 5d928b4cc..70582fbc1 100644 --- 
a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntt.S @@ -8,6 +8,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ #include "consts.h" #include "shuffle.inc" @@ -205,6 +206,7 @@ vmovdqa %ymm11,(128*\off+112)*2(%rdi) .text .global MLKEM_ASM_NAMESPACE(ntt_avx2) +.balign 4 MLKEM_ASM_NAMESPACE(ntt_avx2): vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 @@ -216,4 +218,5 @@ levels1t6 1 ret +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttfrombytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttfrombytes.S new file mode 100644 index 000000000..71f2af000 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttfrombytes.S @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(nttfrombytes_avx2): +#consts +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret + +nttfrombytes128_avx: +#load +vmovdqu (%rsi),%ymm4 +vmovdqu 32(%rsi),%ymm5 +vmovdqu 64(%rsi),%ymm6 +vmovdqu 96(%rsi),%ymm7 +vmovdqu 128(%rsi),%ymm8 +vmovdqu 160(%rsi),%ymm9 + +shuffle8 4,7,3,7 +shuffle8 5,8,4,8 +shuffle8 6,9,5,9 + +shuffle4 3,8,6,8 +shuffle4 7,5,3,5 +shuffle4 4,9,7,9 + +shuffle2 6,5,4,5 +shuffle2 8,7,6,7 +shuffle2 3,9,8,9 + +shuffle1 4,7,10,7 
+shuffle1 5,8,4,8 +shuffle1 6,9,5,9 + +#bitunpack +vpsrlw $12,%ymm10,%ymm11 +vpsllw $4,%ymm7,%ymm12 +vpor %ymm11,%ymm12,%ymm11 +vpand %ymm0,%ymm10,%ymm10 +vpand %ymm0,%ymm11,%ymm11 + +vpsrlw $8,%ymm7,%ymm12 +vpsllw $8,%ymm4,%ymm13 +vpor %ymm12,%ymm13,%ymm12 +vpand %ymm0,%ymm12,%ymm12 + +vpsrlw $4,%ymm4,%ymm13 +vpand %ymm0,%ymm13,%ymm13 + +vpsrlw $12,%ymm8,%ymm14 +vpsllw $4,%ymm5,%ymm15 +vpor %ymm14,%ymm15,%ymm14 +vpand %ymm0,%ymm8,%ymm8 +vpand %ymm0,%ymm14,%ymm14 + +vpsrlw $8,%ymm5,%ymm15 +vpsllw $8,%ymm9,%ymm1 +vpor %ymm15,%ymm1,%ymm15 +vpand %ymm0,%ymm15,%ymm15 + +vpsrlw $4,%ymm9,%ymm1 +vpand %ymm0,%ymm1,%ymm1 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm11,32(%rdi) +vmovdqa %ymm12,64(%rdi) +vmovdqa %ymm13,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm14,160(%rdi) +vmovdqa %ymm15,192(%rdi) +vmovdqa %ymm1,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttpack.S new file mode 100644 index 000000000..4c10ef366 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttpack.S @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(nttpack_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(nttpack_avx2): +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 
8,9,6,9 +shuffle1 10,11,8,11 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +#store +vmovdqa %ymm7,(%rdi) +vmovdqa %ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntttobytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntttobytes.S new file mode 100644 index 000000000..4f0b01e83 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/ntttobytes.S @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(ntttobytes_avx2): +#consts +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret + +ntttobytes128_avx: +#load +vmovdqa (%rsi),%ymm5 +vmovdqa 32(%rsi),%ymm6 +vmovdqa 64(%rsi),%ymm7 +vmovdqa 96(%rsi),%ymm8 +vmovdqa 128(%rsi),%ymm9 +vmovdqa 160(%rsi),%ymm10 +vmovdqa 192(%rsi),%ymm11 +vmovdqa 224(%rsi),%ymm12 + +#bitpack +vpsllw $12,%ymm6,%ymm4 +vpor %ymm4,%ymm5,%ymm4 + +vpsrlw $4,%ymm6,%ymm5 +vpsllw $8,%ymm7,%ymm6 +vpor %ymm5,%ymm6,%ymm5 + +vpsrlw $8,%ymm7,%ymm6 +vpsllw $4,%ymm8,%ymm7 +vpor %ymm6,%ymm7,%ymm6 + +vpsllw $12,%ymm10,%ymm7 
+vpor %ymm7,%ymm9,%ymm7 + +vpsrlw $4,%ymm10,%ymm8 +vpsllw $8,%ymm11,%ymm9 +vpor %ymm8,%ymm9,%ymm8 + +vpsrlw $8,%ymm11,%ymm9 +vpsllw $4,%ymm12,%ymm10 +vpor %ymm9,%ymm10,%ymm9 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 + +shuffle2 3,4,8,4 +shuffle2 6,5,3,5 +shuffle2 7,9,6,9 + +shuffle4 8,3,7,3 +shuffle4 6,4,8,4 +shuffle4 5,9,6,9 + +shuffle8 7,8,5,8 +shuffle8 6,3,7,3 +shuffle8 4,9,6,9 + +#store +vmovdqu %ymm5,(%rdi) +vmovdqu %ymm7,32(%rdi) +vmovdqu %ymm6,64(%rdi) +vmovdqu %ymm8,96(%rdi) +vmovdqu %ymm3,128(%rdi) +vmovdqu %ymm9,160(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttunpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttunpack.S new file mode 100644 index 000000000..0cf45c671 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/nttunpack.S @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(nttunpack_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(nttunpack_avx2): +call nttunpack128_avx2 +add $256,%rdi +call nttunpack128_avx2 +ret + +nttunpack128_avx2: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 
10,11,3,11 + +shuffle1 9,5,10,5 +shuffle1 8,4,9,4 +shuffle1 7,3,8,3 +shuffle1 6,11,7,11 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm9,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm3,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/reduce.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/reduce.S new file mode 100644 index 000000000..78bad0559 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/reduce.S @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation based on Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +// Changes: +// - Add call to csub in reduce128_avx to produce outputs +// in [0,1,...,q-1] rather than [0,1,...,q], matching the +// semantics of poly_reduce(). 
+ +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" + +.text +.global MLKEM_ASM_NAMESPACE(reduce_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(reduce_avx2): +#consts +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1 +call reduce128_avx2 +add $256,%rdi +call reduce128_avx2 +ret + +reduce128_avx2: +#load +vmovdqa (%rdi),%ymm2 +vmovdqa 32(%rdi),%ymm3 +vmovdqa 64(%rdi),%ymm4 +vmovdqa 96(%rdi),%ymm5 +vmovdqa 128(%rdi),%ymm6 +vmovdqa 160(%rdi),%ymm7 +vmovdqa 192(%rdi),%ymm8 +vmovdqa 224(%rdi),%ymm9 + +red16 2 +red16 3 +red16 4 +red16 5 +red16 6 +red16 7 +red16 8 +red16 9 + +csubq 2 +csubq 3 +csubq 4 +csubq 5 +csubq 6 +csubq 7 +csubq 8 +csubq 9 + +#store +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm4,64(%rdi) +vmovdqa %ymm5,96(%rdi) +vmovdqa %ymm6,128(%rdi) +vmovdqa %ymm7,160(%rdi) +vmovdqa %ymm8,192(%rdi) +vmovdqa %ymm9,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S deleted file mode 100644 index 9bcd04896..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/shuffle.S +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -// Implementation from Kyber reference repository -// https://github.com/pq-crystals/kyber/blob/main/avx2 - -#include "../../../common.h" - -#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - -#include "consts.h" -#include "fq.inc" -#include "shuffle.inc" - -.global MLKEM_ASM_NAMESPACE(nttpack_avx2) -MLKEM_ASM_NAMESPACE(nttpack_avx2): -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 
128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 -shuffle1 10,11,8,11 - -shuffle2 3,4,10,4 -shuffle2 6,8,3,8 -shuffle2 5,7,6,7 -shuffle2 9,11,5,11 - -shuffle4 10,3,9,3 -shuffle4 6,5,10,5 -shuffle4 4,8,6,8 -shuffle4 7,11,4,11 - -shuffle8 9,10,7,10 -shuffle8 6,4,9,4 -shuffle8 3,5,6,5 -shuffle8 8,11,3,11 - -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -nttunpack128_avx2: -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -.global MLKEM_ASM_NAMESPACE(nttunpack_avx2) -MLKEM_ASM_NAMESPACE(nttunpack_avx2): -call nttunpack128_avx2 -add $256,%rdi -call nttunpack128_avx2 -ret - -ntttobytes128_avx: -#load -vmovdqa (%rsi),%ymm5 -vmovdqa 32(%rsi),%ymm6 -vmovdqa 64(%rsi),%ymm7 -vmovdqa 96(%rsi),%ymm8 -vmovdqa 128(%rsi),%ymm9 -vmovdqa 160(%rsi),%ymm10 -vmovdqa 192(%rsi),%ymm11 -vmovdqa 224(%rsi),%ymm12 - -#bitpack -vpsllw $12,%ymm6,%ymm4 -vpor %ymm4,%ymm5,%ymm4 - -vpsrlw $4,%ymm6,%ymm5 -vpsllw $8,%ymm7,%ymm6 -vpor %ymm5,%ymm6,%ymm5 - -vpsrlw $8,%ymm7,%ymm6 -vpsllw $4,%ymm8,%ymm7 -vpor %ymm6,%ymm7,%ymm6 - -vpsllw $12,%ymm10,%ymm7 -vpor %ymm7,%ymm9,%ymm7 - 
-vpsrlw $4,%ymm10,%ymm8 -vpsllw $8,%ymm11,%ymm9 -vpor %ymm8,%ymm9,%ymm8 - -vpsrlw $8,%ymm11,%ymm9 -vpsllw $4,%ymm12,%ymm10 -vpor %ymm9,%ymm10,%ymm9 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 - -shuffle2 3,4,8,4 -shuffle2 6,5,3,5 -shuffle2 7,9,6,9 - -shuffle4 8,3,7,3 -shuffle4 6,4,8,4 -shuffle4 5,9,6,9 - -shuffle8 7,8,5,8 -shuffle8 6,3,7,3 -shuffle8 4,9,6,9 - -#store -vmovdqu %ymm5,(%rdi) -vmovdqu %ymm7,32(%rdi) -vmovdqu %ymm6,64(%rdi) -vmovdqu %ymm8,96(%rdi) -vmovdqu %ymm3,128(%rdi) -vmovdqu %ymm9,160(%rdi) - -ret - -.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2) -MLKEM_ASM_NAMESPACE(ntttobytes_avx2): -#consts -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0 -call ntttobytes128_avx -add $256,%rsi -add $192,%rdi -call ntttobytes128_avx -ret - -nttfrombytes128_avx: -#load -vmovdqu (%rsi),%ymm4 -vmovdqu 32(%rsi),%ymm5 -vmovdqu 64(%rsi),%ymm6 -vmovdqu 96(%rsi),%ymm7 -vmovdqu 128(%rsi),%ymm8 -vmovdqu 160(%rsi),%ymm9 - -shuffle8 4,7,3,7 -shuffle8 5,8,4,8 -shuffle8 6,9,5,9 - -shuffle4 3,8,6,8 -shuffle4 7,5,3,5 -shuffle4 4,9,7,9 - -shuffle2 6,5,4,5 -shuffle2 8,7,6,7 -shuffle2 3,9,8,9 - -shuffle1 4,7,10,7 -shuffle1 5,8,4,8 -shuffle1 6,9,5,9 - -#bitunpack -vpsrlw $12,%ymm10,%ymm11 -vpsllw $4,%ymm7,%ymm12 -vpor %ymm11,%ymm12,%ymm11 -vpand %ymm0,%ymm10,%ymm10 -vpand %ymm0,%ymm11,%ymm11 - -vpsrlw $8,%ymm7,%ymm12 -vpsllw $8,%ymm4,%ymm13 -vpor %ymm12,%ymm13,%ymm12 -vpand %ymm0,%ymm12,%ymm12 - -vpsrlw $4,%ymm4,%ymm13 -vpand %ymm0,%ymm13,%ymm13 - -vpsrlw $12,%ymm8,%ymm14 -vpsllw $4,%ymm5,%ymm15 -vpor %ymm14,%ymm15,%ymm14 -vpand %ymm0,%ymm8,%ymm8 -vpand %ymm0,%ymm14,%ymm14 - -vpsrlw $8,%ymm5,%ymm15 -vpsllw $8,%ymm9,%ymm1 -vpor %ymm15,%ymm1,%ymm15 -vpand %ymm0,%ymm15,%ymm15 - -vpsrlw $4,%ymm9,%ymm1 -vpand %ymm0,%ymm1,%ymm1 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm11,32(%rdi) -vmovdqa %ymm12,64(%rdi) -vmovdqa %ymm13,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm14,160(%rdi) -vmovdqa %ymm15,192(%rdi) -vmovdqa %ymm1,224(%rdi) - -ret - -.global 
MLKEM_ASM_NAMESPACE(nttfrombytes_avx2) -MLKEM_ASM_NAMESPACE(nttfrombytes_avx2): -#consts -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0 -call nttfrombytes128_avx -add $256,%rdi -add $192,%rsi -call nttfrombytes128_avx -ret - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/tomont.S similarity index 64% rename from src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S rename to src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/tomont.S index 3f013a5fa..7774cec0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/fq.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/tomont.S @@ -14,63 +14,24 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) -#include "consts.h" +/* simpasm: header-end */ +#include "consts.h" #include "fq.inc" .text -reduce128_avx2: -#load -vmovdqa (%rdi),%ymm2 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm4 -vmovdqa 96(%rdi),%ymm5 -vmovdqa 128(%rdi),%ymm6 -vmovdqa 160(%rdi),%ymm7 -vmovdqa 192(%rdi),%ymm8 -vmovdqa 224(%rdi),%ymm9 - -red16 2 -red16 3 -red16 4 -red16 5 -red16 6 -red16 7 -red16 8 -red16 9 - -csubq 2 -csubq 3 -csubq 4 -csubq 5 -csubq 6 -csubq 7 -csubq 8 -csubq 9 - -#store -vmovdqa %ymm2,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm4,64(%rdi) -vmovdqa %ymm5,96(%rdi) -vmovdqa %ymm6,128(%rdi) -vmovdqa %ymm7,160(%rdi) -vmovdqa %ymm8,192(%rdi) -vmovdqa %ymm9,224(%rdi) - -ret - -.global MLKEM_ASM_NAMESPACE(reduce_avx2) -MLKEM_ASM_NAMESPACE(reduce_avx2): +.global MLKEM_ASM_NAMESPACE(tomont_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(tomont_avx2): #consts vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1 -call reduce128_avx2 +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa 
AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx2 add $256,%rdi -call reduce128_avx2 +call tomont128_avx2 ret - tomont128_avx2: #load vmovdqa (%rdi),%ymm3 @@ -103,15 +64,5 @@ vmovdqa %ymm10,224(%rdi) ret -.global MLKEM_ASM_NAMESPACE(tomont_avx2) -MLKEM_ASM_NAMESPACE(tomont_avx2): -#consts -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1 -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2 -call tomont128_avx2 -add $256,%rdi -call tomont128_avx2 -ret - +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md index e499a4a22..a420f05b6 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/README.md @@ -10,10 +10,9 @@ works: - _Fast and Clean: Auditable high-performance assembly via constraint solving_, Amin Abdulrahman, Hanno Becker, Matthias J. Kannwischer, Fabien Klein, [https://eprint.iacr.org/2022/1303](https://eprint.iacr.org/2022/1303) -## Profiles -This backend comes with two profiles: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to -read and modify; for example, is heavily leverages register aliases and assembly macros. The optimized profile is -automatically generated from the clean profile via [SLOTHY](https://github.com/slothy-optimizer/slothy). Currently, the +## Variants + +This backend comes in two versions: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to read and modify; for example, is heavily leverages register aliases and assembly macros. This directory contains the optimized version, which is automatically generated from the clean one via [SLOTHY](https://github.com/slothy-optimizer/slothy). 
Currently, the target architecture is Cortex-A55, but you can easily re-optimize the code for a different microarchitecture supported -by SLOTHY, by adjusting the parameters in [optimize.sh](src/optimize.sh). +by SLOTHY, by adjusting the parameters in [optimize.sh](../../../test/aarch64_clean/src/optimize.sh). diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h deleted file mode 100644 index f124702a4..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/clean.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -/* ML-KEM arithmetic native profile for clean assembly */ - -#ifdef MLKEM_NATIVE_ARITH_PROFILE_H -#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles? -#else -#define MLKEM_NATIVE_ARITH_PROFILE_H - -/* Identifier for this backend so that source and assembly files - * in the build can be appropriately guarded. */ -#define MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN - -#define MLKEM_NATIVE_ARITH_BACKEND_NAME AARCH64_CLEAN - -/* Filename of the C backend implementation. - * This is not inlined here because this header is included in assembly - * files as well. 
*/ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h" - -#endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h index a7217163f..4a0243279 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/opt.h @@ -3,8 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -/* ML-KEM arithmetic native profile for clean assembly */ - #ifdef MLKEM_NATIVE_ARITH_PROFILE_H #error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles? #else diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c index 2c1bb31e1..23e7949d3 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/aarch64_zetas.c @@ -10,8 +10,7 @@ #include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) #include #include "arith_native_aarch64.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h index ed0825892..60779598d 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/arith_native_aarch64.h @@ -29,62 +29,49 @@ extern const int16_t aarch64_zetas_mulcache_native[]; extern const int16_t aarch64_zetas_mulcache_twisted_native[]; extern const uint8_t rej_uniform_table[]; -#define ntt_asm_clean 
MLKEM_NAMESPACE(ntt_asm_clean) -void ntt_asm_clean(int16_t *, const int16_t *, const int16_t *); - #define ntt_asm_opt MLKEM_NAMESPACE(ntt_asm_opt) void ntt_asm_opt(int16_t *, const int16_t *, const int16_t *); -#define intt_asm_clean MLKEM_NAMESPACE(intt_asm_clean) -void intt_asm_clean(int16_t *, const int16_t *, const int16_t *); - #define intt_asm_opt MLKEM_NAMESPACE(intt_asm_opt) void intt_asm_opt(int16_t *, const int16_t *, const int16_t *); -#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean) -unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen, - const uint8_t *table); - -#define poly_reduce_asm_clean MLKEM_NAMESPACE(poly_reduce_asm_clean) -void poly_reduce_asm_clean(int16_t *); - #define poly_reduce_asm_opt MLKEM_NAMESPACE(poly_reduce_asm_opt) void poly_reduce_asm_opt(int16_t *); -#define poly_tomont_asm_clean MLKEM_NAMESPACE(poly_tomont_asm_clean) -void poly_tomont_asm_clean(int16_t *); - #define poly_tomont_asm_opt MLKEM_NAMESPACE(poly_tomont_asm_opt) void poly_tomont_asm_opt(int16_t *); -#define poly_mulcache_compute_asm_clean \ - MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean) -void poly_mulcache_compute_asm_clean(int16_t *, const int16_t *, - const int16_t *, const int16_t *); - - #define poly_mulcache_compute_asm_opt \ MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt) void poly_mulcache_compute_asm_opt(int16_t *, const int16_t *, const int16_t *, const int16_t *); -#define poly_tobytes_asm_clean MLKEM_NAMESPACE(poly_tobytes_asm_clean) -void poly_tobytes_asm_clean(uint8_t *r, const int16_t *a); - #define poly_tobytes_asm_opt MLKEM_NAMESPACE(poly_tobytes_asm_opt) void poly_tobytes_asm_opt(uint8_t *r, const int16_t *a); -#define polyvec_basemul_acc_montgomery_cached_asm_clean \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) -void polyvec_basemul_acc_montgomery_cached_asm_clean(int16_t *r, - const int16_t *a, - const int16_t *b, - const int16_t *b_cache); +#define 
polyvec_basemul_acc_montgomery_cached_asm_k2_opt \ + MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt) +void polyvec_basemul_acc_montgomery_cached_asm_k2_opt(int16_t *r, + const int16_t *a, + const int16_t *b, + const int16_t *b_cache); + +#define polyvec_basemul_acc_montgomery_cached_asm_k3_opt \ + MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt) +void polyvec_basemul_acc_montgomery_cached_asm_k3_opt(int16_t *r, + const int16_t *a, + const int16_t *b, + const int16_t *b_cache); + +#define polyvec_basemul_acc_montgomery_cached_asm_k4_opt \ + MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt) +void polyvec_basemul_acc_montgomery_cached_asm_k4_opt(int16_t *r, + const int16_t *a, + const int16_t *b, + const int16_t *b_cache); -#define polyvec_basemul_acc_montgomery_cached_asm_opt \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) -void polyvec_basemul_acc_montgomery_cached_asm_opt(int16_t *r, const int16_t *a, - const int16_t *b, - const int16_t *b_cache); +#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean) +unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen, + const uint8_t *table); #endif /* MLKEM_AARCH64_NATIVE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h deleted file mode 100644 index 4be90fb24..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/clean_impl.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -/* ML-KEM arithmetic native profile for clean assembly */ - -#ifdef MLKEM_NATIVE_ARITH_PROFILE_IMPL_H -#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles? 
-#else -#define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H - -#include "arith_native_aarch64.h" - -/* Set of primitives that this backend replaces */ -#define MLKEM_USE_NATIVE_NTT -#define MLKEM_USE_NATIVE_INTT -#define MLKEM_USE_NATIVE_POLY_REDUCE -#define MLKEM_USE_NATIVE_POLY_TOMONT -#define MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE -#define MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED -#define MLKEM_USE_NATIVE_POLY_TOBYTES -#define MLKEM_USE_NATIVE_REJ_UNIFORM - -static INLINE void ntt_native(int16_t data[MLKEM_N]) -{ - ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56); -} - -static INLINE void intt_native(int16_t data[MLKEM_N]) -{ - intt_asm_clean(data, aarch64_invntt_zetas_layer01234, - aarch64_invntt_zetas_layer56); -} - -static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) -{ - poly_reduce_asm_clean(data); -} - -static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) -{ - poly_tomont_asm_clean(data); -} - -static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], - const int16_t y[MLKEM_N]) -{ - poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native, - aarch64_zetas_mulcache_twisted_native); -} - -static INLINE void polyvec_basemul_acc_montgomery_cached_native( - int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], - const int16_t b[MLKEM_K * MLKEM_N], - const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) -{ - polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache); -} - -static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const int16_t a[MLKEM_N]) -{ - poly_tobytes_asm_clean(r, a); -} - -static INLINE int rej_uniform_native(int16_t *r, unsigned len, - const uint8_t *buf, unsigned buflen) -{ - if (len != MLKEM_N || buflen % 24 != 0) - { - return -1; - } - return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table); -} - -#endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S 
b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S deleted file mode 100644 index b0ae1ad46..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_clean.S +++ /dev/null @@ -1,389 +0,0 @@ -/// Copyright (c) 2024 The mlkem-native project authors -/// Copyright (c) 2022 Arm Limited -/// Copyright (c) 2022 Hanno Becker -/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE. -/// - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - -// Bounds: -// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) -// -// See mlken/reduce.c and test/test_bounds.py for more details. -.macro mulmodq dst, src, const, idx0, idx1 - // Signed barrett multiplication using - // round-to-nearest-even-integer approximation. 
- // Following https://eprint.iacr.org/2021/986.pdf, this - // is functionally the same as a signed Montgomery multiplication - // with a suitable constant of absolute value < q. - sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()] - mul \dst\().8h, \src\().8h, \const\().h[\idx0\()] - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro mulmod dst, src, const, const_twisted - sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h - mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro gs_butterfly a, b, root, idx0, idx1 - sub tmp.8h, \a\().8h, \b\().8h - add \a\().8h, \a\().8h, \b\().8h - mulmodq \b, tmp, \root, \idx0, \idx1 -.endm - -.macro gs_butterfly_v a, b, root, root_twisted - sub tmp.8h, \a\().8h, \b\().8h - add \a\().8h, \a\().8h, \b\().8h - mulmod \b, tmp, \root, \root_twisted -.endm - -.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 - mulmod \dst0, \src0, ninv, ninv_tw - mulmod \dst1, \src1, ninv, ninv_tw - mulmod \dst2, \src2, ninv, ninv_tw - mulmod \dst3, \src3, ninv, ninv_tw -.endm - -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] -.endm - -.macro load_roots_012 - ldr q_root0, [r01234_ptr], #32 - ldr q_root1, [r01234_ptr, #-16] -.endm - -.macro load_next_roots_34 - ldr q_root0, [r01234_ptr], #16 -.endm - -.macro load_next_roots_56 - ldr q_root0, [r56_ptr], #(6*16) - ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)] - ldr q_root1, [r56_ptr, #(-6*16 + 2*16)] - ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)] - ldr q_root2, [r56_ptr, #(-6*16 + 4*16)] - ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)] -.endm - -.macro transpose4 data - trn1 t0.4s, \data\()0.4s, \data\()1.4s - trn2 t1.4s, \data\()0.4s, \data\()1.4s - trn1 t2.4s, \data\()2.4s, \data\()3.4s - trn2 t3.4s, \data\()2.4s, \data\()3.4s - - trn2 \data\()2.2d, t0.2d, t2.2d - trn2 \data\()3.2d, t1.2d, t3.2d - trn1 \data\()0.2d, t0.2d, t2.2d - trn1 \data\()1.2d, t1.2d, t3.2d -.endm - -.macro 
transpose_single data_out, data_in - trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s - trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s - trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s - trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs -.endm - -.macro pop_stack - restore_vregs -.endm - -// For comparability reasons, the output range for the coefficients of this -// invNTT code is supposed to match the implementation from PQClean on commit -// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients -// are NOT canonically reduced. The ordering of the coefficients is canonical, -// also matching PQClean. 
- -.text - .global MLKEM_ASM_NAMESPACE(intt_asm_clean) - - in .req x0 - r01234_ptr .req x1 - r56_ptr .req x2 - - inp .req x3 - count .req x4 - wtmp .req w5 - - data0 .req v8 - data1 .req v9 - data2 .req v10 - data3 .req v11 - data4 .req v12 - data5 .req v13 - data6 .req v14 - data7 .req v15 - - q_data0 .req q8 - q_data1 .req q9 - q_data2 .req q10 - q_data3 .req q11 - q_data4 .req q12 - q_data5 .req q13 - q_data6 .req q14 - q_data7 .req q15 - - root0 .req v0 - root1 .req v1 - root2 .req v2 - root0_tw .req v4 - root1_tw .req v5 - root2_tw .req v6 - - consts .req v7 - q_consts .req q7 - - q_root0 .req q0 - q_root1 .req q1 - q_root2 .req q2 - q_root0_tw .req q4 - q_root1_tw .req q5 - q_root2_tw .req q6 - - tmp .req v24 - t0 .req v25 - t1 .req v26 - t2 .req v27 - t3 .req v28 - - ninv .req v29 - ninv_tw .req v30 - -.balign 4 -MLKEM_ASM_NAMESPACE(intt_asm_clean): - push_stack - - // Setup constants - mov wtmp, #3329 - mov consts.h[0], wtmp - mov wtmp, #20159 - mov consts.h[1], wtmp - mov wtmp, #512 - dup ninv.8h, wtmp - mov wtmp, #5040 - dup ninv_tw.8h, wtmp - - mov inp, in - mov count, #8 - -scale_start: - - ldr q_data0, [inp, #(16*0)] - ldr q_data1, [inp, #(16*1)] - ldr q_data2, [inp, #(16*2)] - ldr q_data3, [inp, #(16*3)] - - mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - // Bounds: Absolute value < q - - str q_data0, [inp], #64 - str q_data1, [inp, #(-64 + 16*1)] - str q_data2, [inp, #(-64 + 16*2)] - str q_data3, [inp, #(-64 + 16*3)] - - subs count, count, #1 - cbnz count, scale_start - - mov inp, in - mov count, #8 - - .p2align 2 -layer3456_start: - - ldr q_data0, [inp, #(16*0)] - ldr q_data1, [inp, #(16*1)] - ldr q_data2, [inp, #(16*2)] - ldr q_data3, [inp, #(16*3)] - - transpose4 data // manual ld4 - - load_next_roots_56 - - // Layer 7 - gs_butterfly_v data0, data1, root1, root1_tw - gs_butterfly_v data2, data3, root2, root2_tw - // Bounds: - // data0, data2: < 2q - // data1, data3: < q - - // Layer 6 - gs_butterfly_v data0, data2, root0, 
root0_tw - gs_butterfly_v data1, data3, root0, root0_tw - // Bounds: - // data0: < 4q - // data1: < 2q - // data2, data3: < q - - transpose4 data - - load_next_roots_34 - - // Layer 5 - gs_butterfly data0, data1, root0, 2, 3 - gs_butterfly data2, data3, root0, 4, 5 - // Max bound: 8q - - // Not all of those reductions are needed, but the bounds tracking - // is easier if we uniformly reduce at this point. - barrett_reduce data0 - barrett_reduce data2 - barrett_reduce data1 - barrett_reduce data3 - - // Bounds: q/2 - - // Layer 4 - gs_butterfly data0, data2, root0, 0, 1 - gs_butterfly data1, data3, root0, 0, 1 - // Bounds: < q - - str q_data0, [inp], #(64) - str q_data1, [inp, #(-64 + 16*1)] - str q_data2, [inp, #(-64 + 16*2)] - str q_data3, [inp, #(-64 + 16*3)] - - subs count, count, #1 - cbnz count, layer3456_start - - // --------------------------------------------------------------------- - - mov count, #4 - load_roots_012 - - .p2align 2 - -layer012_start: - - ldr q_data0, [in, #0] - ldr q_data1, [in, #(1*(512/8))] - ldr q_data2, [in, #(2*(512/8))] - ldr q_data3, [in, #(3*(512/8))] - ldr q_data4, [in, #(4*(512/8))] - ldr q_data5, [in, #(5*(512/8))] - ldr q_data6, [in, #(6*(512/8))] - ldr q_data7, [in, #(7*(512/8))] - - gs_butterfly data0, data1, root0, 6, 7 - gs_butterfly data2, data3, root1, 0, 1 - gs_butterfly data4, data5, root1, 2, 3 - gs_butterfly data6, data7, root1, 4, 5 - - gs_butterfly data0, data2, root0, 2, 3 - gs_butterfly data1, data3, root0, 2, 3 - gs_butterfly data4, data6, root0, 4, 5 - gs_butterfly data5, data7, root0, 4, 5 - - gs_butterfly data0, data4, root0, 0, 1 - gs_butterfly data1, data5, root0, 0, 1 - gs_butterfly data2, data6, root0, 0, 1 - gs_butterfly data3, data7, root0, 0, 1 - - // Bounds: < 8q - - str q_data4, [in, #(4*(512/8))] - str q_data5, [in, #(5*(512/8))] - str q_data6, [in, #(6*(512/8))] - str q_data7, [in, #(7*(512/8))] - - str q_data0, [in], #(16) - str q_data1, [in, #(-16 + 1*(512/8))] - str q_data2, [in, #(-16 + 
2*(512/8))] - str q_data3, [in, #(-16 + 3*(512/8))] - - subs count, count, #1 - cbnz count, layer012_start - - pop_stack - ret - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq in - .unreq r01234_ptr - .unreq r56_ptr - .unreq inp - .unreq count - .unreq wtmp - .unreq data0 - .unreq data1 - .unreq data2 - .unreq data3 - .unreq data4 - .unreq data5 - .unreq data6 - .unreq data7 - .unreq q_data0 - .unreq q_data1 - .unreq q_data2 - .unreq q_data3 - .unreq q_data4 - .unreq q_data5 - .unreq q_data6 - .unreq q_data7 - .unreq root0 - .unreq root1 - .unreq root2 - .unreq root0_tw - .unreq root1_tw - .unreq root2_tw - .unreq consts - .unreq q_consts - .unreq q_root0 - .unreq q_root1 - .unreq q_root2 - .unreq q_root0_tw - .unreq q_root1_tw - .unreq q_root2_tw - .unreq tmp - .unreq t0 - .unreq t1 - .unreq t2 - .unreq t3 - .unreq ninv - .unreq ninv_tw - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S index 191de3c4d..0f9e44307 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/intt_opt.S @@ -25,6 +25,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) @@ -139,9 +140,6 @@ // are NOT canonically reduced. The ordering of the coefficients is canonical, // also matching PQClean. 
-.text - .global MLKEM_ASM_NAMESPACE(intt_asm_opt) - in .req x0 r01234_ptr .req x1 r56_ptr .req x2 @@ -194,7 +192,9 @@ ninv .req v29 ninv_tw .req v30 -.balign 4 + .text + .global MLKEM_ASM_NAMESPACE(intt_asm_opt) + .balign 4 MLKEM_ASM_NAMESPACE(intt_asm_opt): push_stack @@ -1042,4 +1042,5 @@ layer012_start: .unreq ninv .unreq ninv_tw +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S deleted file mode 100644 index 4f844e212..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/ntt_clean.S +++ /dev/null @@ -1,317 +0,0 @@ -/// -/// Copyright (c) 2022 Arm Limited -/// Copyright (c) 2022 Hanno Becker -/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer -/// Copyright (c) 2024 The mlkem-native project authors -// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE. -/// - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - -// Bounds: -// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) -// -// See mlken/reduce.c and test/test_bounds.py for more details. -.macro mulmodq dst, src, const, idx0, idx1 - // Signed barrett multiplication using - // round-to-nearest-even-integer approximation. - // Following https://eprint.iacr.org/2021/986.pdf, this - // is functionally the same as a signed Montgomery multiplication - // with a suitable constant of absolute value < q. - sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()] - mul \dst\().8h, \src\().8h, \const\().h[\idx0\()] - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro mulmod dst, src, const, const_twisted - sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h - mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro ct_butterfly a, b, root, idx0, idx1 - mulmodq tmp, \b, \root, \idx0, \idx1 - sub \b\().8h, \a\().8h, tmp.8h - add \a\().8h, \a\().8h, tmp.8h -.endm - -.macro ct_butterfly_v a, b, root, root_twisted - mulmod tmp, \b, \root, \root_twisted - sub \b\().8h, \a\().8h, tmp.8h - add \a\().8h, \a\().8h, tmp.8h -.endm - -.macro load_roots_012 - ldr q_root0, [r01234_ptr], #32 - ldr q_root1, [r01234_ptr, #-16] -.endm - -.macro load_next_roots_34 - ldr q_root0, [r01234_ptr], #16 -.endm - -.macro load_next_roots_56 - ldr q_root0, [r56_ptr], #(6*16) - ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)] - ldr q_root1, [r56_ptr, #(-6*16 + 2*16)] - ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)] - ldr q_root2, [r56_ptr, #(-6*16 + 4*16)] - ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)] -.endm - -.macro transpose4 data - trn1 t0.4s, 
\data\()0.4s, \data\()1.4s - trn2 t1.4s, \data\()0.4s, \data\()1.4s - trn1 t2.4s, \data\()2.4s, \data\()3.4s - trn2 t3.4s, \data\()2.4s, \data\()3.4s - - trn2 \data\()2.2d, t0.2d, t2.2d - trn2 \data\()3.2d, t1.2d, t3.2d - trn1 \data\()0.2d, t0.2d, t2.2d - trn1 \data\()1.2d, t1.2d, t3.2d -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs -.endm - -.macro pop_stack - restore_vregs -.endm - - // Arguments - in .req x0 // Input/output buffer - r01234_ptr .req x1 // twiddles for layer 0,1,2,3,4 - r56_ptr .req x2 // twiddles for layer 5,6 - - inp .req x3 - count .req x4 - wtmp .req w5 - - data0 .req v8 - data1 .req v9 - data2 .req v10 - data3 .req v11 - data4 .req v12 - data5 .req v13 - data6 .req v14 - data7 .req v15 - - q_data0 .req q8 - q_data1 .req q9 - q_data2 .req q10 - q_data3 .req q11 - q_data4 .req q12 - q_data5 .req q13 - q_data6 .req q14 - q_data7 .req q15 - - root0 .req v0 - root1 .req v1 - root2 .req v2 - root0_tw .req v4 - root1_tw .req v5 - root2_tw .req v6 - - q_root0 .req q0 - q_root1 .req q1 - q_root2 .req q2 - q_root0_tw .req q4 - q_root1_tw .req q5 - q_root2_tw .req q6 - - consts .req v7 - - tmp .req v24 - t0 .req v25 - t1 .req v26 - t2 .req v27 - t3 .req v28 - - .text - .global MLKEM_ASM_NAMESPACE(ntt_asm_clean) - - .balign 4 -MLKEM_ASM_NAMESPACE(ntt_asm_clean): - push_stack - - mov wtmp, #3329 - mov consts.h[0], wtmp - mov wtmp, #20159 - mov consts.h[1], wtmp - - mov inp, in - mov count, #4 - - load_roots_012 - - .p2align 2 - - // Bounds reasoning: - // - There are 7 layers - // - When passing from layer N to layer N+1, each layer-N value - // is modified through the addition/subtraction of a Montgomery - // product of a twiddle of absolute 
value < q/2 and a layer-N value. - // - Recalling that for C such that |a| < C * q and |t|> 0); - xtn out0.8b, data0.8h - - // r[3 * i + 1] = (t0 >> 8); - shrn out1.8b, data0.8h, #8 - xtn tmp.8b, data1.8h - // r[3 * i + 1] = (t0 >> 8) | (t1 << 4); - sli out1.8b, tmp.8b, #4 - - // r[3 * i + 2] = (t1 >> 4); - shrn out2.8b, data1.8h, #4 - - st3 {out0.8b, out1.8b, out2.8b}, [dst], #24 - - subs count, count, #1 - cbnz count, poly_tobytes_asm_clean_asm_loop_start - ret - - .unreq data0 - .unreq data1 - .unreq out0 - .unreq out1 - .unreq out2 - .unreq tmp - .unreq dst - .unreq src - .unreq count - -/********************************** - * poly_tomont() * - **********************************/ -.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean) - - src .req x0 - count .req x1 - wtmp .req w2 - - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 - - factor .req v2 - factor_t .req v3 - modulus .req v4 - modulus_twisted .req v5 - - tmp0 .req v6 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean): - - mov wtmp, #3329 // ML-KEM modulus - dup modulus.8h, wtmp - - mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 - dup modulus_twisted.8h, wtmp - - mov wtmp, #-1044 // 2^16 % 3329 - dup factor.8h, wtmp - - mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16) - dup factor_t.8h, wtmp - - mov count, #8 -poly_tomont_asm_loop: - - ldr q_data, [src], #64 - mulmod res, data, factor, factor_t - str q_res, [src, #-64] - - ldr q_data, [src, #-48] - mulmod res, data, factor, factor_t - str q_res, [src, #-48] - - ldr q_data, [src, #-32] - mulmod res, data, factor, factor_t - str q_res, [src, #-32] - - ldr q_data, [src, #-16] - mulmod res, data, factor, factor_t - str q_res, [src, #-16] - - sub count, count, #1 - cbnz count, poly_tomont_asm_loop - - ret - - .unreq src - .unreq count - .unreq wtmp - - .unreq data - .unreq q_data - .unreq res - .unreq q_res - - .unreq factor - .unreq factor_t - .unreq modulus - .unreq modulus_twisted - - .unreq tmp0 - -#endif /* 
MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S new file mode 100644 index 000000000..a3593b7fd --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in consts.h[0]. */ +.macro mulmod dst, src, const, const_twisted + sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h + mul \dst\().8h, \src\().8h, \const\().8h + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/******************************************** + * poly_mulcache_compute() * + ********************************************/ + + + cache_ptr .req x0 + data_ptr .req x1 + zeta_ptr .req x2 + zeta_twisted_ptr .req x3 + count .req x4 + wtmp .req w5 + + data_odd .req v0 + zeta .req v1 + q_zeta .req q1 + zeta_twisted .req v2 + q_zeta_twisted .req q2 + + tmp0 .req v3 + q_tmp0 .req q3 + tmp1 .req v4 + q_tmp1 .req q4 + dst .req v5 + q_dst .req q5 + + modulus .req v6 + modulus_twisted .req v7 + + .text + .global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #20159 + dup modulus_twisted.8h, wtmp + + mov count, #16 + // Instructions: 7 + // Expected cycles: 12 + // Expected IPC: 0.58 + + // Cycle bound: 12.0 + // IPC bound: 0.58 + + // Wall time: 0.01s + // User time: 0.01s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q1, [x1, #16] // *............................. 
+ ldr q27, [x1], #32 // ..*........................... + ldr q23, [x2], #16 // ....*......................... + uzp2 v27.8H, v27.8H, v1.8H // ......*....................... + ldr q1, [x3], #16 // .......*...................... + mul v2.8H, v27.8H, v23.8H // .........*.................... + sqrdmulh v27.8H, v27.8H, v1.8H // ...........*.................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q29, [x1, #16] // *.............................. + // ldr q21, [x2], #16 // ....*.......................... + // ldr q27, [x1], #32 // ..*............................ + // ldr q7, [x3], #16 // .......*....................... + // uzp2 v28.8H, v27.8H, v29.8H // ......*........................ + // mul v2.8H, v28.8H, v21.8H // .........*..................... + // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*................... + + sub count, count, #1 +poly_mulcache_compute_asm_opt_loop: + // Instructions: 9 + // Expected cycles: 13 + // Expected IPC: 0.69 + + // Cycle bound: 13.0 + // IPC bound: 0.69 + + // Wall time: 0.09s + // User time: 0.09s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q29, [x1, #16] // e............................. + ldr q21, [x2], #16 // ..e........................... + mls v2.8H, v27.8H, v6.H[0] // ....*......................... + ldr q27, [x1], #32 // .....e........................ + ldr q7, [x3], #16 // .......e...................... + uzp2 v28.8H, v27.8H, v29.8H // .........e.................... + str q2, [x0], #16 // ..........*................... + mul v2.8H, v28.8H, v21.8H // ...........e.................. + sqrdmulh v27.8H, v28.8H, v7.8H // ............e................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q3, [x1], #32 // .....e.......'....~.......'.... + // ldr q4, [x1, #-16] // e............~............~.... + // ldr q1, [x2], #16 // ..e..........'.~..........'.~.. 
+ // ldr q2, [x3], #16 // .......e.....'......~.....'.... + // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'.... + // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'.... + // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'.... + // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'.... + // str q5, [x0], #16 // ..........~..'.........*..'.... + + sub count, count, 1 + cbnz count, poly_mulcache_compute_asm_opt_loop + // Instructions: 2 + // Expected cycles: 5 + // Expected IPC: 0.40 + + // Cycle bound: 5.0 + // IPC bound: 0.40 + + // Wall time: 0.00s + // User time: 0.00s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + mls v2.8H, v27.8H, v6.H[0] // *............................. + str q2, [x0], #16 // ....*......................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // mls v2.8H, v27.8H, v6.H[0] // *.............................. + // str q2, [x0], #16 // ....*.......................... 
+ + + ret + + .unreq cache_ptr + .unreq data_ptr + .unreq zeta_ptr + .unreq zeta_twisted_ptr + .unreq count + .unreq wtmp + + .unreq data_odd + .unreq zeta + .unreq q_zeta + .unreq zeta_twisted + .unreq q_zeta_twisted + + .unreq tmp0 + .unreq q_tmp0 + .unreq tmp1 + .unreq q_tmp1 + .unreq dst + .unreq q_dst + + .unreq modulus + .unreq modulus_twisted + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S deleted file mode 100644 index 79605818f..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_opt.S +++ /dev/null @@ -1,670 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) - -/* - * Some modular arithmetic macros - */ - -/* Barrett reduction */ -.macro barrett_reduce a - sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] - srshr tmp.8h, tmp.8h, #11 - mls \a\().8h, tmp.8h, modulus.h[0] -.endm - -/* Montgomery multiplication, with precomputed Montgomery twist - * Expects modulus in consts.h[0]. */ -.macro mulmod dst, src, const, const_twisted - sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h - mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, tmp0.8h, modulus.h[0] -.endm - -/* Turns signed-canonical to unsigned canonical representative - * through conditional addition of the modulus. - * - * Expected modulus in `modulus`. 
*/ -.macro scalar_signed_to_unsigned a - sshr mask.8h, \a\().8h, #15 - and mask.16b, modulus.16b, mask.16b - add \a\().8h, \a\().8h, mask.8h -.endm - -/********************************** - * poly_reduce() * - **********************************/ - -.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt) - - ptr .req x0 - count .req x1 - wtmp .req w2 - - data .req v0 - q_data .req q0 - - tmp .req v1 - mask .req v2 - modulus .req v3 - modulus_twisted .req v4 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): - - mov wtmp, #3329 // ML-KEM modulus - dup modulus.8h, wtmp - - mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 - dup modulus_twisted.8h, wtmp - - mov count, #8 - // Instructions: 15 - // Expected cycles: 22 - // Expected IPC: 0.68 - - // Cycle bound: 22.0 - // IPC bound: 0.68 - - // Wall time: 0.05s - // User time: 0.05s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q21, [x0, #32] // *............................. - ldr q23, [x0, #48] // ..*........................... - sqdmulh v7.8H, v21.8H, v4.H[0] // ....*......................... - sqdmulh v30.8H, v23.8H, v4.H[0] // ......*....................... - srshr v7.8H, v7.8H, #11 // ........*..................... - srshr v30.8H, v30.8H, #11 // ..........*................... - mls v21.8H, v7.8H, v3.H[0] // ...........*.................. - mls v23.8H, v30.8H, v3.H[0] // .............*................ - ldr q5, [x0, #16] // ..............*............... - sshr v7.8H, v21.8H, #15 // ................*............. - sshr v30.8H, v23.8H, #15 // .................*............ - and v7.16B, v3.16B, v7.16B // ..................*........... - add v21.8H, v21.8H, v7.8H // ...................*.......... - and v7.16B, v3.16B, v30.16B // ....................*......... - add v16.8H, v23.8H, v7.8H // .....................*........ - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q30, [x0, #32] // *.............................. 
- // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*.......................... - // ldr q2, [x0, #48] // ..*............................ - // srshr v19.8H, v22.8H, #11 // ........*...................... - // mls v30.8H, v19.8H, v3.H[0] // ...........*................... - // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................ - // sshr v31.8H, v30.8H, #15 // ................*.............. - // srshr v25.8H, v25.8H, #11 // ..........*.................... - // and v18.16B, v3.16B, v31.16B // ..................*............ - // mls v2.8H, v25.8H, v3.H[0] // .............*................. - // add v21.8H, v30.8H, v18.8H // ...................*........... - // ldr q5, [x0, #16] // ..............*................ - // sshr v18.8H, v2.8H, #15 // .................*............. - // and v27.16B, v3.16B, v18.16B // ....................*.......... - // add v16.8H, v2.8H, v27.8H // .....................*......... - - sub count, count, #1 -1: - // Instructions: 32 - // Expected cycles: 36 - // Expected IPC: 0.89 - - // Cycle bound: 36.0 - // IPC bound: 0.89 - - // Wall time: 1.05s - // User time: 1.05s - - // -------- cycle (expected) ---------> - // 0 25 - // |------------------------|---------- - ldr q6, [x0], #64 // *................................... - ldr q30, [x0, #32] // ..e................................. - sqdmulh v31.8H, v6.8H, v4.H[0] // ....*............................... - sqdmulh v29.8H, v5.8H, v4.H[0] // .....*.............................. - sqdmulh v22.8H, v30.8H, v4.H[0] // ......e............................. - str q16, [x0, #-16] // .......*............................ - srshr v20.8H, v31.8H, #11 // ........*........................... - srshr v28.8H, v29.8H, #11 // .........*.......................... - str q21, [x0, #-32] // ..........*......................... - mls v6.8H, v20.8H, v3.H[0] // ...........*........................ - mls v5.8H, v28.8H, v3.H[0] // ............*....................... 
- ldr q2, [x0, #48] // .............e...................... - sshr v31.8H, v6.8H, #15 // ...............*.................... - srshr v19.8H, v22.8H, #11 // ................e................... - and v22.16B, v3.16B, v31.16B // .................*.................. - add v0.8H, v6.8H, v22.8H // ..................*................. - mls v30.8H, v19.8H, v3.H[0] // ...................e................ - sshr v26.8H, v5.8H, #15 // ....................*............... - sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e.............. - and v17.16B, v3.16B, v26.16B // ......................*............. - add v1.8H, v5.8H, v17.8H // .......................*............ - sshr v31.8H, v30.8H, #15 // ........................e........... - srshr v25.8H, v25.8H, #11 // .........................e.......... - str q1, [x0, #-48] // ..........................*......... - and v18.16B, v3.16B, v31.16B // ...........................e........ - mls v2.8H, v25.8H, v3.H[0] // ............................e....... - add v21.8H, v30.8H, v18.8H // .............................e...... - ldr q5, [x0, #16] // ..............................e..... - sshr v18.8H, v2.8H, #15 // ................................e... - str q0, [x0, #-64] // .................................*.. - and v27.16B, v3.16B, v18.16B // ..................................e. - add v16.8H, v2.8H, v27.8H // ...................................e - - // ------------------------ cycle (expected) -------------------------> - // 0 25 50 - // |------------------------|------------------------|----------------- - // ldr q0, [x0], #64 // ..................................*................................. - // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*............................. - // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*......................... - // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*...................... 
- // sshr v2.8h, v0.8h, #15 // .............~....................'..............*.................. - // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................ - // add v0.8h, v0.8h, v2.8h // ................~.................'.................*............... - // str q0, [x0, #-64] // ...............................~..'................................* - // ldr q0, [x0, #-48] // ............................e.....'.............................~... - // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................ - // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................ - // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*..................... - // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*............. - // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*........... - // add v0.8h, v0.8h, v2.8h // .....................~............'......................*.......... - // str q0, [x0, #-48] // ........................~.........'.........................*....... - // ldr q0, [x0, #-32] // e.................................'.~............................... - // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~........................... - // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~................. - // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~.............. - // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~......... - // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~...... - // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~.... 
- // str q0, [x0, #-32] // ........~.........................'.........*....................... - // ldr q0, [x0, #-16] // ...........e......................'............~.................... - // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............ - // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ - // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~..... - // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~. - // and v2.16b, v3.16b, v2.16b // ................................e.'................................. - // add v0.8h, v0.8h, v2.8h // .................................e'................................. - // str q0, [x0, #-16] // .....~............................'......*.......................... - - sub count, count, 1 - cbnz count, 1b - // Instructions: 17 - // Expected cycles: 23 - // Expected IPC: 0.74 - - // Cycle bound: 23.0 - // IPC bound: 0.74 - - // Wall time: 0.05s - // User time: 0.05s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - sqdmulh v20.8H, v5.8H, v4.H[0] // *............................. - ldr q24, [x0], #64 // .*............................ - str q21, [x0, #-32] // ...*.......................... - srshr v20.8H, v20.8H, #11 // ....*......................... - sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................ - str q16, [x0, #-16] // ......*....................... - mls v5.8H, v20.8H, v3.H[0] // .......*...................... - srshr v20.8H, v25.8H, #11 // .........*.................... - sshr v2.8H, v5.8H, #15 // ...........*.................. - mls v24.8H, v20.8H, v3.H[0] // ............*................. - and v20.16B, v3.16B, v2.16B // .............*................ - add v31.8H, v5.8H, v20.8H // ..............*............... - sshr v20.8H, v24.8H, #15 // ................*............. 
- str q31, [x0, #-48] // .................*............ - and v31.16B, v3.16B, v20.16B // ..................*........... - add v24.8H, v24.8H, v31.8H // ...................*.......... - str q24, [x0, #-64] // ......................*....... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q6, [x0], #64 // .*............................. - // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*......................... - // sqdmulh v29.8H, v5.8H, v4.H[0] // *.............................. - // str q16, [x0, #-16] // ......*........................ - // srshr v20.8H, v31.8H, #11 // .........*..................... - // srshr v28.8H, v29.8H, #11 // ....*.......................... - // str q21, [x0, #-32] // ...*........................... - // mls v6.8H, v20.8H, v3.H[0] // ............*.................. - // mls v5.8H, v28.8H, v3.H[0] // .......*....................... - // sshr v31.8H, v6.8H, #15 // ................*.............. - // and v22.16B, v3.16B, v31.16B // ..................*............ - // add v0.8H, v6.8H, v22.8H // ...................*........... - // sshr v26.8H, v5.8H, #15 // ...........*................... - // and v17.16B, v3.16B, v26.16B // .............*................. - // add v1.8H, v5.8H, v17.8H // ..............*................ - // str q1, [x0, #-48] // .................*............. - // str q0, [x0, #-64] // ......................*........ 
- - - ret - - .unreq ptr - .unreq count - .unreq wtmp - - .unreq data - .unreq q_data - - .unreq tmp - .unreq mask - .unreq modulus - .unreq modulus_twisted - -/******************************************** - * poly_mulcache_compute() * - ********************************************/ - -.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt) - - cache_ptr .req x0 - data_ptr .req x1 - zeta_ptr .req x2 - zeta_twisted_ptr .req x3 - count .req x4 - wtmp .req w5 - - data_odd .req v0 - zeta .req v1 - q_zeta .req q1 - zeta_twisted .req v2 - q_zeta_twisted .req q2 - - tmp0 .req v3 - q_tmp0 .req q3 - tmp1 .req v4 - q_tmp1 .req q4 - dst .req v5 - q_dst .req q5 - - modulus .req v6 - modulus_twisted .req v7 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #20159 - dup modulus_twisted.8h, wtmp - - mov count, #16 - // Instructions: 7 - // Expected cycles: 12 - // Expected IPC: 0.58 - - // Cycle bound: 12.0 - // IPC bound: 0.58 - - // Wall time: 0.01s - // User time: 0.01s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q1, [x1, #16] // *............................. - ldr q27, [x1], #32 // ..*........................... - ldr q23, [x2], #16 // ....*......................... - uzp2 v27.8H, v27.8H, v1.8H // ......*....................... - ldr q1, [x3], #16 // .......*...................... - mul v2.8H, v27.8H, v23.8H // .........*.................... - sqrdmulh v27.8H, v27.8H, v1.8H // ...........*.................. - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q29, [x1, #16] // *.............................. - // ldr q21, [x2], #16 // ....*.......................... - // ldr q27, [x1], #32 // ..*............................ - // ldr q7, [x3], #16 // .......*....................... - // uzp2 v28.8H, v27.8H, v29.8H // ......*........................ - // mul v2.8H, v28.8H, v21.8H // .........*..................... 
- // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*................... - - sub count, count, #1 -1: - // Instructions: 9 - // Expected cycles: 13 - // Expected IPC: 0.69 - - // Cycle bound: 13.0 - // IPC bound: 0.69 - - // Wall time: 0.09s - // User time: 0.09s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q29, [x1, #16] // e............................. - ldr q21, [x2], #16 // ..e........................... - mls v2.8H, v27.8H, v6.H[0] // ....*......................... - ldr q27, [x1], #32 // .....e........................ - ldr q7, [x3], #16 // .......e...................... - uzp2 v28.8H, v27.8H, v29.8H // .........e.................... - str q2, [x0], #16 // ..........*................... - mul v2.8H, v28.8H, v21.8H // ...........e.................. - sqrdmulh v27.8H, v28.8H, v7.8H // ............e................. - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q3, [x1], #32 // .....e.......'....~.......'.... - // ldr q4, [x1, #-16] // e............~............~.... - // ldr q1, [x2], #16 // ..e..........'.~..........'.~.. - // ldr q2, [x3], #16 // .......e.....'......~.....'.... - // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'.... - // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'.... - // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'.... - // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'.... - // str q5, [x0], #16 // ..........~..'.........*..'.... - - sub count, count, 1 - cbnz count, 1b - // Instructions: 2 - // Expected cycles: 5 - // Expected IPC: 0.40 - - // Cycle bound: 5.0 - // IPC bound: 0.40 - - // Wall time: 0.00s - // User time: 0.00s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - mls v2.8H, v27.8H, v6.H[0] // *............................. - str q2, [x0], #16 // ....*......................... 
- - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // mls v2.8H, v27.8H, v6.H[0] // *.............................. - // str q2, [x0], #16 // ....*.......................... - - - ret - - .unreq cache_ptr - .unreq data_ptr - .unreq zeta_ptr - .unreq zeta_twisted_ptr - .unreq count - .unreq wtmp - - .unreq data_odd - .unreq zeta - .unreq q_zeta - .unreq zeta_twisted - .unreq q_zeta_twisted - - .unreq tmp0 - .unreq q_tmp0 - .unreq tmp1 - .unreq q_tmp1 - .unreq dst - .unreq q_dst - - .unreq modulus - .unreq modulus_twisted - -/******************************************** - * poly_tobytes() * - ********************************************/ -.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt) - - data0 .req v0 - data1 .req v1 - out0 .req v2 - out1 .req v3 - out2 .req v4 - tmp .req v5 - - dst .req x0 - src .req x1 - count .req x2 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt): - - mov count, #16 -poly_tobytes_asm_opt_asm_loop_start: - ld2 {data0.8h, data1.8h}, [src], #32 - - // r[3 * i + 0] = (t0 >> 0); - xtn out0.8b, data0.8h - - // r[3 * i + 1] = (t0 >> 8); - shrn out1.8b, data0.8h, #8 - xtn tmp.8b, data1.8h - // r[3 * i + 1] = (t0 >> 8) | (t1 << 4); - sli out1.8b, tmp.8b, #4 - - // r[3 * i + 2] = (t1 >> 4); - shrn out2.8b, data1.8h, #4 - - st3 {out0.8b, out1.8b, out2.8b}, [dst], #24 - - subs count, count, #1 - cbnz count, poly_tobytes_asm_opt_asm_loop_start - ret - - .unreq data0 - .unreq data1 - .unreq out0 - .unreq out1 - .unreq out2 - .unreq tmp - .unreq dst - .unreq src - .unreq count - -/********************************** - * poly_tomont() * - **********************************/ -.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt) - - src .req x0 - count .req x1 - wtmp .req w2 - - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 - - factor .req v2 - factor_t .req v3 - modulus .req v4 - modulus_twisted .req v5 - - tmp0 .req v6 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): - - mov wtmp, #3329 // ML-KEM 
modulus - dup modulus.8h, wtmp - - mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 - dup modulus_twisted.8h, wtmp - - mov wtmp, #-1044 // 2^16 % 3329 - dup factor.8h, wtmp - - mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16) - dup factor_t.8h, wtmp - - mov count, #8 - // Instructions: 5 - // Expected cycles: 7 - // Expected IPC: 0.71 - // - // Cycle bound: 7.0 - // IPC bound: 0.71 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q26, [x0, #48] // *............................. - ldr q23, [x0, #16] // ..*........................... - mul v17.8H, v26.8H, v2.8H // ....*......................... - sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................ - ldr q27, [x0, #32] // ......*....................... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q7, [x0, #48] // *.............................. - // ldr q23, [x0, #16] // ..*............................ - // mul v17.8H, v7.8H, v2.8H // ....*.......................... - // sqrdmulh v7.8H, v7.8H, v3.8H // .....*......................... - // ldr q27, [x0, #32] // ......*........................ - - sub count, count, #1 -1: - // Instructions: 20 - // Expected cycles: 24 - // Expected IPC: 0.83 - // - // Cycle bound: 24.0 - // IPC bound: 0.83 - // - // Wall time: 0.73s - // User time: 0.73s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - mls v17.8H, v7.8H, v4.H[0] // *............................. - sqrdmulh v5.8H, v23.8H, v3.8H // .*............................ - ldr q7, [x0], #64 // ..*........................... - str q17, [x0, #-16] // ....*......................... - sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................ - sqrdmulh v19.8H, v7.8H, v3.8H // ......*....................... - mul v25.8H, v23.8H, v2.8H // .......*...................... - mul v0.8H, v7.8H, v2.8H // ........*..................... 
- mul v26.8H, v27.8H, v2.8H // .........*.................... - ldr q7, [x0, #48] // ..........e................... - mls v25.8H, v5.8H, v4.H[0] // ............*................. - ldr q23, [x0, #16] // .............e................ - mls v26.8H, v29.8H, v4.H[0] // ...............*.............. - mls v0.8H, v19.8H, v4.H[0] // ................*............. - str q25, [x0, #-48] // .................*............ - mul v17.8H, v7.8H, v2.8H // ..................e........... - sqrdmulh v7.8H, v7.8H, v3.8H // ...................e.......... - str q0, [x0, #-64] // ....................*......... - ldr q27, [x0, #32] // .....................e........ - str q26, [x0, #-32] // .......................*...... - - // --------- cycle (expected) ----------> - // 0 25 - // |------------------------|------------ - // ldr q0, [x0], #64 // ..............'.*..................... - // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*................. - // mul v1.8h, v0.8h, v2.8h // ..............'.......*............... - // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*....... - // str q1, [x0, #-64] // ..........~...'...................*... - // ldr q0, [x0, #-48] // ...e..........'............~.......... - // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*...................... - // mul v1.8h, v0.8h, v2.8h // ..............'......*................ - // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*........... - // str q1, [x0, #-48] // .......~......'................*...... - // ldr q0, [x0, #-32] // ...........e..'....................~.. - // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*.................. - // mul v1.8h, v0.8h, v2.8h // ..............'........*.............. - // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........ - // str q1, [x0, #-32] // .............~'......................* - // ldr q0, [x0, #-16] // e.............'.........~............. 
- // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~.... - // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~..... - // mls v1.8h, v6.8h, v4.h[0] // ..............*....................... - // str q1, [x0, #-16] // ..............'...*................... - - sub count, count, 1 - cbnz count, 1b - // Instructions: 15 - // Expected cycles: 18 - // Expected IPC: 0.83 - // - // Cycle bound: 18.0 - // IPC bound: 0.83 - // - // Wall time: 0.07s - // User time: 0.07s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - mls v17.8H, v7.8H, v4.H[0] // *............................. - sqrdmulh v7.8H, v23.8H, v3.8H // .*............................ - mul v26.8H, v23.8H, v2.8H // ..*........................... - sqrdmulh v25.8H, v27.8H, v3.8H // ...*.......................... - ldr q23, [x0], #64 // ....*......................... - mul v27.8H, v27.8H, v2.8H // ......*....................... - mls v26.8H, v7.8H, v4.H[0] // .......*...................... - sqrdmulh v7.8H, v23.8H, v3.8H // ........*..................... - mul v23.8H, v23.8H, v2.8H // .........*.................... - str q17, [x0, #-16] // ..........*................... - mls v27.8H, v25.8H, v4.H[0] // ...........*.................. - str q26, [x0, #-48] // ............*................. - mls v23.8H, v7.8H, v4.H[0] // .............*................ - str q27, [x0, #-32] // ...............*.............. - str q23, [x0, #-64] // .................*............ - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // mls v17.8H, v7.8H, v4.H[0] // *.............................. - // sqrdmulh v5.8H, v23.8H, v3.8H // .*............................. - // ldr q7, [x0], #64 // ....*.......................... - // str q17, [x0, #-16] // ..........*.................... - // sqrdmulh v29.8H, v27.8H, v3.8H // ...*........................... - // sqrdmulh v19.8H, v7.8H, v3.8H // ........*...................... 
- // mul v25.8H, v23.8H, v2.8H // ..*............................ - // mul v0.8H, v7.8H, v2.8H // .........*..................... - // mul v26.8H, v27.8H, v2.8H // ......*........................ - // mls v25.8H, v5.8H, v4.H[0] // .......*....................... - // mls v26.8H, v29.8H, v4.H[0] // ...........*................... - // mls v0.8H, v19.8H, v4.H[0] // .............*................. - // str q25, [x0, #-48] // ............*.................. - // str q0, [x0, #-64] // .................*............. - // str q26, [x0, #-32] // ...............*............... - - - ret - - .unreq src - .unreq count - .unreq wtmp - - .unreq data - .unreq q_data - .unreq res - .unreq q_res - - .unreq factor - .unreq factor_t - .unreq modulus - .unreq modulus_twisted - - .unreq tmp0 - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_reduce_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_reduce_asm_opt.S new file mode 100644 index 000000000..410950730 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_reduce_asm_opt.S @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/* Barrett reduction */ +.macro barrett_reduce a + sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] + srshr tmp.8h, tmp.8h, #11 + mls \a\().8h, tmp.8h, modulus.h[0] +.endm + +/* Turns signed-canonical to unsigned canonical representative + * through conditional addition of the modulus. + * + * Expected modulus in `modulus`. 
*/ +.macro scalar_signed_to_unsigned a + sshr mask.8h, \a\().8h, #15 + and mask.16b, modulus.16b, mask.16b + add \a\().8h, \a\().8h, mask.8h +.endm + +/********************************** + * poly_reduce() * + **********************************/ + + ptr .req x0 + count .req x1 + wtmp .req w2 + + data .req v0 + q_data .req q0 + + tmp .req v1 + mask .req v2 + modulus .req v3 + modulus_twisted .req v4 + + .text + .global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): + + mov wtmp, #3329 // ML-KEM modulus + dup modulus.8h, wtmp + + mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 + dup modulus_twisted.8h, wtmp + + mov count, #8 + // Instructions: 15 + // Expected cycles: 22 + // Expected IPC: 0.68 + + // Cycle bound: 22.0 + // IPC bound: 0.68 + + // Wall time: 0.05s + // User time: 0.05s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q21, [x0, #32] // *............................. + ldr q23, [x0, #48] // ..*........................... + sqdmulh v7.8H, v21.8H, v4.H[0] // ....*......................... + sqdmulh v30.8H, v23.8H, v4.H[0] // ......*....................... + srshr v7.8H, v7.8H, #11 // ........*..................... + srshr v30.8H, v30.8H, #11 // ..........*................... + mls v21.8H, v7.8H, v3.H[0] // ...........*.................. + mls v23.8H, v30.8H, v3.H[0] // .............*................ + ldr q5, [x0, #16] // ..............*............... + sshr v7.8H, v21.8H, #15 // ................*............. + sshr v30.8H, v23.8H, #15 // .................*............ + and v7.16B, v3.16B, v7.16B // ..................*........... + add v21.8H, v21.8H, v7.8H // ...................*.......... + and v7.16B, v3.16B, v30.16B // ....................*......... + add v16.8H, v23.8H, v7.8H // .....................*........ + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q30, [x0, #32] // *.............................. 
+ // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*.......................... + // ldr q2, [x0, #48] // ..*............................ + // srshr v19.8H, v22.8H, #11 // ........*...................... + // mls v30.8H, v19.8H, v3.H[0] // ...........*................... + // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................ + // sshr v31.8H, v30.8H, #15 // ................*.............. + // srshr v25.8H, v25.8H, #11 // ..........*.................... + // and v18.16B, v3.16B, v31.16B // ..................*............ + // mls v2.8H, v25.8H, v3.H[0] // .............*................. + // add v21.8H, v30.8H, v18.8H // ...................*........... + // ldr q5, [x0, #16] // ..............*................ + // sshr v18.8H, v2.8H, #15 // .................*............. + // and v27.16B, v3.16B, v18.16B // ....................*.......... + // add v16.8H, v2.8H, v27.8H // .....................*......... + + sub count, count, #1 +poly_reduce_asm_opt_loop: + // Instructions: 32 + // Expected cycles: 36 + // Expected IPC: 0.89 + + // Cycle bound: 36.0 + // IPC bound: 0.89 + + // Wall time: 1.05s + // User time: 1.05s + + // -------- cycle (expected) ---------> + // 0 25 + // |------------------------|---------- + ldr q6, [x0], #64 // *................................... + ldr q30, [x0, #32] // ..e................................. + sqdmulh v31.8H, v6.8H, v4.H[0] // ....*............................... + sqdmulh v29.8H, v5.8H, v4.H[0] // .....*.............................. + sqdmulh v22.8H, v30.8H, v4.H[0] // ......e............................. + str q16, [x0, #-16] // .......*............................ + srshr v20.8H, v31.8H, #11 // ........*........................... + srshr v28.8H, v29.8H, #11 // .........*.......................... + str q21, [x0, #-32] // ..........*......................... + mls v6.8H, v20.8H, v3.H[0] // ...........*........................ + mls v5.8H, v28.8H, v3.H[0] // ............*....................... 
+ ldr q2, [x0, #48] // .............e...................... + sshr v31.8H, v6.8H, #15 // ...............*.................... + srshr v19.8H, v22.8H, #11 // ................e................... + and v22.16B, v3.16B, v31.16B // .................*.................. + add v0.8H, v6.8H, v22.8H // ..................*................. + mls v30.8H, v19.8H, v3.H[0] // ...................e................ + sshr v26.8H, v5.8H, #15 // ....................*............... + sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e.............. + and v17.16B, v3.16B, v26.16B // ......................*............. + add v1.8H, v5.8H, v17.8H // .......................*............ + sshr v31.8H, v30.8H, #15 // ........................e........... + srshr v25.8H, v25.8H, #11 // .........................e.......... + str q1, [x0, #-48] // ..........................*......... + and v18.16B, v3.16B, v31.16B // ...........................e........ + mls v2.8H, v25.8H, v3.H[0] // ............................e....... + add v21.8H, v30.8H, v18.8H // .............................e...... + ldr q5, [x0, #16] // ..............................e..... + sshr v18.8H, v2.8H, #15 // ................................e... + str q0, [x0, #-64] // .................................*.. + and v27.16B, v3.16B, v18.16B // ..................................e. + add v16.8H, v2.8H, v27.8H // ...................................e + + // ------------------------ cycle (expected) -------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------- + // ldr q0, [x0], #64 // ..................................*................................. + // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*............................. + // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*......................... + // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*...................... 
+ // sshr v2.8h, v0.8h, #15 // .............~....................'..............*.................. + // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................ + // add v0.8h, v0.8h, v2.8h // ................~.................'.................*............... + // str q0, [x0, #-64] // ...............................~..'................................* + // ldr q0, [x0, #-48] // ............................e.....'.............................~... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................ + // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................ + // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*..................... + // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*............. + // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*........... + // add v0.8h, v0.8h, v2.8h // .....................~............'......................*.......... + // str q0, [x0, #-48] // ........................~.........'.........................*....... + // ldr q0, [x0, #-32] // e.................................'.~............................... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~........................... + // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~................. + // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~.............. + // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~......... + // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~...... + // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~.... 
+ // str q0, [x0, #-32] // ........~.........................'.........*....................... + // ldr q0, [x0, #-16] // ...........e......................'............~.................... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............ + // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ + // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~..... + // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~. + // and v2.16b, v3.16b, v2.16b // ................................e.'................................. + // add v0.8h, v0.8h, v2.8h // .................................e'................................. + // str q0, [x0, #-16] // .....~............................'......*.......................... + + sub count, count, 1 + cbnz count, poly_reduce_asm_opt_loop + // Instructions: 17 + // Expected cycles: 23 + // Expected IPC: 0.74 + + // Cycle bound: 23.0 + // IPC bound: 0.74 + + // Wall time: 0.05s + // User time: 0.05s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + sqdmulh v20.8H, v5.8H, v4.H[0] // *............................. + ldr q24, [x0], #64 // .*............................ + str q21, [x0, #-32] // ...*.......................... + srshr v20.8H, v20.8H, #11 // ....*......................... + sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................ + str q16, [x0, #-16] // ......*....................... + mls v5.8H, v20.8H, v3.H[0] // .......*...................... + srshr v20.8H, v25.8H, #11 // .........*.................... + sshr v2.8H, v5.8H, #15 // ...........*.................. + mls v24.8H, v20.8H, v3.H[0] // ............*................. + and v20.16B, v3.16B, v2.16B // .............*................ + add v31.8H, v5.8H, v20.8H // ..............*............... 
+ sshr v20.8H, v24.8H, #15 // ................*............. + str q31, [x0, #-48] // .................*............ + and v31.16B, v3.16B, v20.16B // ..................*........... + add v24.8H, v24.8H, v31.8H // ...................*.......... + str q24, [x0, #-64] // ......................*....... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q6, [x0], #64 // .*............................. + // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*......................... + // sqdmulh v29.8H, v5.8H, v4.H[0] // *.............................. + // str q16, [x0, #-16] // ......*........................ + // srshr v20.8H, v31.8H, #11 // .........*..................... + // srshr v28.8H, v29.8H, #11 // ....*.......................... + // str q21, [x0, #-32] // ...*........................... + // mls v6.8H, v20.8H, v3.H[0] // ............*.................. + // mls v5.8H, v28.8H, v3.H[0] // .......*....................... + // sshr v31.8H, v6.8H, #15 // ................*.............. + // and v22.16B, v3.16B, v31.16B // ..................*............ + // add v0.8H, v6.8H, v22.8H // ...................*........... + // sshr v26.8H, v5.8H, #15 // ...........*................... + // and v17.16B, v3.16B, v26.16B // .............*................. + // add v1.8H, v5.8H, v17.8H // ..............*................ + // str q1, [x0, #-48] // .................*............. + // str q0, [x0, #-64] // ......................*........ 
+ + + ret + + .unreq ptr + .unreq count + .unreq wtmp + + .unreq data + .unreq q_data + + .unreq tmp + .unreq mask + .unreq modulus + .unreq modulus_twisted + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S new file mode 100644 index 000000000..bc33afd43 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/******************************************** + * poly_tobytes() * + ********************************************/ + + data0 .req v0 + data1 .req v1 + out0 .req v2 + out1 .req v3 + out2 .req v4 + tmp .req v5 + + dst .req x0 + src .req x1 + count .req x2 + + .text + .global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt): + + mov count, #16 +poly_tobytes_asm_opt_asm_loop_start: + ld2 {data0.8h, data1.8h}, [src], #32 + + // r[3 * i + 0] = (t0 >> 0); + xtn out0.8b, data0.8h + + // r[3 * i + 1] = (t0 >> 8); + shrn out1.8b, data0.8h, #8 + xtn tmp.8b, data1.8h + // r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + sli out1.8b, tmp.8b, #4 + + // r[3 * i + 2] = (t1 >> 4); + shrn out2.8b, data1.8h, #4 + + st3 {out0.8b, out1.8b, out2.8b}, [dst], #24 + + subs count, count, #1 + cbnz count, poly_tobytes_asm_opt_asm_loop_start + ret + + .unreq data0 + .unreq data1 + .unreq out0 + .unreq out1 + .unreq out2 + .unreq tmp + .unreq dst + .unreq src + .unreq count + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git 
a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tomont_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tomont_asm_opt.S new file mode 100644 index 000000000..bcbff9adb --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/poly_tomont_asm_opt.S @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in modulus.h[0]. */ +.macro mulmod dst, src, const, const_twisted + sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h + mul \dst\().8h, \src\().8h, \const\().8h + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/********************************** + * poly_tomont() * + **********************************/ + + src .req x0 + count .req x1 + wtmp .req w2 + + data .req v0 + q_data .req q0 + res .req v1 + q_res .req q1 + + factor .req v2 + factor_t .req v3 + modulus .req v4 + modulus_twisted .req v5 + + tmp0 .req v6 + + + .text + .global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): + + mov wtmp, #3329 // ML-KEM modulus + dup modulus.8h, wtmp + + mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 + dup modulus_twisted.8h, wtmp + + mov wtmp, #-1044 // 2^16 % 3329 + dup factor.8h, wtmp + + mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16) + dup factor_t.8h, wtmp + + mov count, #8 + // Instructions: 5 + // Expected cycles: 7 + // Expected IPC: 0.71 + // + // Cycle bound: 7.0 + // IPC bound: 0.71 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q26, [x0, #48] // *............................. + ldr q23, [x0, #16] // ..*...........................
+ mul v17.8H, v26.8H, v2.8H // ....*......................... + sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................ + ldr q27, [x0, #32] // ......*....................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q7, [x0, #48] // *.............................. + // ldr q23, [x0, #16] // ..*............................ + // mul v17.8H, v7.8H, v2.8H // ....*.......................... + // sqrdmulh v7.8H, v7.8H, v3.8H // .....*......................... + // ldr q27, [x0, #32] // ......*........................ + + sub count, count, #1 +poly_tomont_asm_opt_loop: + // Instructions: 20 + // Expected cycles: 24 + // Expected IPC: 0.83 + // + // Cycle bound: 24.0 + // IPC bound: 0.83 + // + // Wall time: 0.73s + // User time: 0.73s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + mls v17.8H, v7.8H, v4.H[0] // *............................. + sqrdmulh v5.8H, v23.8H, v3.8H // .*............................ + ldr q7, [x0], #64 // ..*........................... + str q17, [x0, #-16] // ....*......................... + sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................ + sqrdmulh v19.8H, v7.8H, v3.8H // ......*....................... + mul v25.8H, v23.8H, v2.8H // .......*...................... + mul v0.8H, v7.8H, v2.8H // ........*..................... + mul v26.8H, v27.8H, v2.8H // .........*.................... + ldr q7, [x0, #48] // ..........e................... + mls v25.8H, v5.8H, v4.H[0] // ............*................. + ldr q23, [x0, #16] // .............e................ + mls v26.8H, v29.8H, v4.H[0] // ...............*.............. + mls v0.8H, v19.8H, v4.H[0] // ................*............. + str q25, [x0, #-48] // .................*............ + mul v17.8H, v7.8H, v2.8H // ..................e........... + sqrdmulh v7.8H, v7.8H, v3.8H // ...................e.......... + str q0, [x0, #-64] // ....................*......... 
+ ldr q27, [x0, #32] // .....................e........ + str q26, [x0, #-32] // .......................*...... + + // --------- cycle (expected) ----------> + // 0 25 + // |------------------------|------------ + // ldr q0, [x0], #64 // ..............'.*..................... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*................. + // mul v1.8h, v0.8h, v2.8h // ..............'.......*............... + // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*....... + // str q1, [x0, #-64] // ..........~...'...................*... + // ldr q0, [x0, #-48] // ...e..........'............~.......... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*...................... + // mul v1.8h, v0.8h, v2.8h // ..............'......*................ + // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*........... + // str q1, [x0, #-48] // .......~......'................*...... + // ldr q0, [x0, #-32] // ...........e..'....................~.. + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*.................. + // mul v1.8h, v0.8h, v2.8h // ..............'........*.............. + // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........ + // str q1, [x0, #-32] // .............~'......................* + // ldr q0, [x0, #-16] // e.............'.........~............. + // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~.... + // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~..... + // mls v1.8h, v6.8h, v4.h[0] // ..............*....................... + // str q1, [x0, #-16] // ..............'...*................... + + sub count, count, 1 + cbnz count, poly_tomont_asm_opt_loop + // Instructions: 15 + // Expected cycles: 18 + // Expected IPC: 0.83 + // + // Cycle bound: 18.0 + // IPC bound: 0.83 + // + // Wall time: 0.07s + // User time: 0.07s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + mls v17.8H, v7.8H, v4.H[0] // *............................. 
+ sqrdmulh v7.8H, v23.8H, v3.8H // .*............................ + mul v26.8H, v23.8H, v2.8H // ..*........................... + sqrdmulh v25.8H, v27.8H, v3.8H // ...*.......................... + ldr q23, [x0], #64 // ....*......................... + mul v27.8H, v27.8H, v2.8H // ......*....................... + mls v26.8H, v7.8H, v4.H[0] // .......*...................... + sqrdmulh v7.8H, v23.8H, v3.8H // ........*..................... + mul v23.8H, v23.8H, v2.8H // .........*.................... + str q17, [x0, #-16] // ..........*................... + mls v27.8H, v25.8H, v4.H[0] // ...........*.................. + str q26, [x0, #-48] // ............*................. + mls v23.8H, v7.8H, v4.H[0] // .............*................ + str q27, [x0, #-32] // ...............*.............. + str q23, [x0, #-64] // .................*............ + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // mls v17.8H, v7.8H, v4.H[0] // *.............................. + // sqrdmulh v5.8H, v23.8H, v3.8H // .*............................. + // ldr q7, [x0], #64 // ....*.......................... + // str q17, [x0, #-16] // ..........*.................... + // sqrdmulh v29.8H, v27.8H, v3.8H // ...*........................... + // sqrdmulh v19.8H, v7.8H, v3.8H // ........*...................... + // mul v25.8H, v23.8H, v2.8H // ..*............................ + // mul v0.8H, v7.8H, v2.8H // .........*..................... + // mul v26.8H, v27.8H, v2.8H // ......*........................ + // mls v25.8H, v5.8H, v4.H[0] // .......*....................... + // mls v26.8H, v29.8H, v4.H[0] // ...........*................... + // mls v0.8H, v19.8H, v4.H[0] // .............*................. + // str q25, [x0, #-48] // ............*.................. + // str q0, [x0, #-64] // .................*............. + // str q26, [x0, #-32] // ...............*............... 
+ + + ret + + .unreq src + .unreq count + .unreq wtmp + + .unreq data + .unreq q_data + .unreq res + .unreq q_res + + .unreq factor + .unreq factor_t + .unreq modulus + .unreq modulus_twisted + + .unreq tmp0 + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S new file mode 100644 index 000000000..e336b92cb --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// AArch64 re-implementation of the asymmetric base multiplication from: + +// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 +// https://eprint.iacr.org/2021/986 +// https://github.com/neon-ntt/neon-ntt + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 2 +/* simpasm: header-end */ + +// Input: +// - Vectors al, ah of 32-bit entries +// Output: +// - Montgomery reductions of al || ah, stored in x +.macro montgomery_reduce_long x, a + uzp1 t0.8h, \a\()l.8h, \a\()h.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, t0.8h, modulus.8h + uzp2 \x\().8h, \a\()l.8h, \a\()h.8h +.endm + +// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+ +// Bounds: +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 +.macro pmull d, a, b + smull \d\()0l.4s, \a\()0.4h, \b\()0.4h + smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smull \d\()1l.4s, \a\()0.4h, \b\()1.4h + smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro pmlal d, a, b + smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h + smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h + smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + +.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + + out .req x0 + a0_ptr .req x1 + b0_ptr .req x2 + b0_cache_ptr .req x3 + a1_ptr .req x4 + b1_ptr .req x5 + b1_cache_ptr .req x6 + a2_ptr .req x7 + b2_ptr .req x8 + b2_cache_ptr .req x9 + a3_ptr .req x10 + b3_ptr .req x11 + b3_cache_ptr .req x12 + count .req x13 + wtmp .req w14 + + modulus .req v0 + 
modulus_twisted .req v2 + + aa0 .req v3 + aa1 .req v4 + bb0 .req v5 + bb1 .req v6 + bb1t .req v7 + + res0l .req v8 + res1l .req v9 + res0h .req v10 + res1h .req v11 + + tmp0 .req v12 + tmp1 .req v13 + q_tmp0 .req q12 + q_tmp1 .req q13 + + out0 .req v26 + out1 .req v27 + + t0 .req v28 + + .text + .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt): + push_stack + + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp + + // Computed bases of vector entries + + add a1_ptr, a0_ptr, #(1 * 512) + add b1_ptr, b0_ptr, #(1 * 512) + add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) + + mov count, #(MLKEM_N / 16) + // Instructions: 75 + // Expected cycles: 94 + // Expected IPC: 0.80 + + // Cycle bound: 94.0 + // IPC bound: 0.80 + + // Wall time: 1.49s + // User time: 1.49s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q9, [x4], #32 // *.......................................................................... + ldr q5, [x4, #-16] // ......*.................................................................... + ldr q11, [x5], #32 // .*......................................................................... + uzp1 v23.8H, v9.8H, v5.8H // .........*................................................................. + uzp2 v9.8H, v9.8H, v5.8H // .....................*..................................................... + ldr q5, [x2], #32 // ..*........................................................................ + ldr q7, [x5, #-16] // ..............*............................................................ + ldr q21, [x2, #-16] // ...*....................................................................... + uzp2 v10.8H, v11.8H, v7.8H // .................*......................................................... 
+ uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................ + uzp1 v7.8H, v5.8H, v21.8H // ....*...................................................................... + uzp2 v5.8H, v5.8H, v21.8H // .....*..................................................................... + ldr q21, [x1], #32 // .......*................................................................... + ldr q25, [x1, #-16] // ........*.................................................................. + ld1 {v6.8H}, [x3], #16 // ............................*.............................................. + uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................ + uzp2 v21.8H, v21.8H, v25.8H // ...........*............................................................... + smull v25.4S, v26.4H, v5.4H // ............*.............................................................. + smull2 v5.4S, v26.8H, v5.8H // .............*............................................................. + smull v19.4S, v26.4H, v7.4H // ..........................*................................................ + smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................ + smlal v25.4S, v21.4H, v7.4H // ...............*........................................................... + smlal2 v5.4S, v21.8H, v7.8H // ................*.......................................................... + smlal v19.4S, v21.4H, v6.4H // ...................................*....................................... + smlal2 v26.4S, v21.8H, v6.8H // .................................*......................................... + smlal v25.4S, v23.4H, v10.4H // ...................*....................................................... + smlal2 v5.4S, v23.8H, v10.8H // ....................*...................................................... 
+ smlal v19.4S, v23.4H, v11.4H // ......................................*.................................... + smlal2 v26.4S, v23.8H, v11.8H // ....................................*...................................... + ld1 {v23.8H}, [x6], #16 // ........................*.................................................. + smlal v25.4S, v9.4H, v11.4H // ......................*.................................................... + smlal2 v5.4S, v9.8H, v11.8H // .......................*................................................... + smlal2 v26.4S, v9.8H, v23.8H // .......................................*................................... + smlal v19.4S, v9.4H, v23.4H // .........................................*................................. + ldr q9, [x4], #32 // ...............................*........................................... + uzp1 v11.8H, v25.8H, v5.8H // .........................*................................................. + uzp1 v23.8H, v19.8H, v26.8H // .............................................*............................. + mul v11.8H, v11.8H, v2.8H // ...........................*............................................... + mul v23.8H, v23.8H, v2.8H // ..............................................*............................ + ldr q7, [x5], #32 // ................................*.......................................... + smlal2 v5.4S, v11.8H, v0.8H // .............................*............................................. + smlal v25.4S, v11.4H, v0.4H // ..................................*........................................ + ldr q11, [x2], #32 // .....................................*..................................... + ldr q21, [x2, #-16] // ........................................*.................................. + ldr q6, [x4, #-16] // ...............................................*........................... 
+ uzp1 v17.8H, v11.8H, v21.8H // ...........................................*............................... + ldr q10, [x1], #32 // ................................................*.......................... + ldr q29, [x1, #-16] // .................................................*......................... + uzp2 v11.8H, v11.8H, v21.8H // ............................................*.............................. + uzp1 v13.8H, v9.8H, v6.8H // ...................................................*....................... + uzp1 v3.8H, v10.8H, v29.8H // ....................................................*...................... + uzp2 v10.8H, v10.8H, v29.8H // .....................................................*..................... + smull v12.4S, v3.4H, v11.4H // ......................................................*.................... + smull2 v11.4S, v3.8H, v11.8H // .......................................................*................... + ldr q21, [x5, #-16] // ........................................................*.................. + smlal v12.4S, v10.4H, v17.4H // .........................................................*................. + smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................ + uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*............... + uzp1 v15.8H, v7.8H, v21.8H // ............................................................*.............. + smlal v12.4S, v13.4H, v29.4H // .............................................................*............. + smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............ + uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*........... + smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................ 
+ smlal v12.4S, v28.4H, v15.4H // .................................................................*......... + smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........ + smlal v19.4S, v23.4H, v0.4H // ................................................................*.......... + uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................ + smull v23.4S, v3.4H, v17.4H // ......................................................................*.... + uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*..... + uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*...... + mul v14.8H, v9.8H, v2.8H // .......................................................................*... + ld1 {v22.8H}, [x6], #16 // ...................................................................*....... + zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................* + ld1 {v4.8H}, [x3], #16 // .........................................................................*. + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q18, [x4], #32 // *.......................................................................... + // ldr q30, [x5], #32 // ..*........................................................................ + // ldr q8, [x2], #32 // .....*..................................................................... + // ldr q9, [x2, #-16] // .......*................................................................... + // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................ 
+ // uzp2 v4.8H, v8.8H, v9.8H // ...........*............................................................... + // ldr q19, [x4, #-16] // .*......................................................................... + // ldr q29, [x1], #32 // ............*.............................................................. + // ldr q12, [x1, #-16] // .............*............................................................. + // uzp1 v13.8H, v18.8H, v19.8H // ...*....................................................................... + // uzp1 v3.8H, v29.8H, v12.8H // ...............*........................................................... + // uzp2 v10.8H, v29.8H, v12.8H // ................*.......................................................... + // smull v12.4S, v3.4H, v4.4H // .................*......................................................... + // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................ + // ldr q5, [x5, #-16] // ......*.................................................................... + // smlal v12.4S, v10.4H, v17.4H // .....................*..................................................... + // smlal2 v11.4S, v10.8H, v17.8H // ......................*.................................................... + // uzp2 v14.8H, v30.8H, v5.8H // ........*.................................................................. + // uzp1 v15.8H, v30.8H, v5.8H // .........*................................................................. + // smlal v12.4S, v13.4H, v14.4H // .........................*................................................. + // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................ + // uzp2 v28.8H, v18.8H, v19.8H // ....*...................................................................... + // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................ 
+ // smlal2 v11.4S, v28.8H, v15.8H // ...............................*........................................... + // ld1 {v22.8H}, [x6], #16 // .............................*............................................. + // uzp1 v1.8H, v12.8H, v11.8H // ...................................*....................................... + // smull v23.4S, v3.4H, v17.4H // ...................*....................................................... + // mul v14.8H, v1.8H, v2.8H // .....................................*..................................... + // ld1 {v4.8H}, [x3], #16 // ..............*............................................................ + // smlal2 v11.4S, v14.8H, v0.8H // ........................................*.................................. + // smull2 v20.4S, v3.8H, v17.8H // ....................*...................................................... + // ldr q18, [x4], #32 // ..................................*........................................ + // ldr q30, [x5], #32 // .......................................*................................... + // smlal2 v20.4S, v10.8H, v4.8H // ........................*.................................................. + // smlal v12.4S, v14.4H, v0.4H // .........................................*................................. + // smlal v23.4S, v10.4H, v4.4H // .......................*................................................... + // smlal2 v20.4S, v13.8H, v15.8H // ............................*.............................................. + // ldr q8, [x2], #32 // ..........................................*................................ + // smlal v23.4S, v13.4H, v15.4H // ...........................*............................................... + // smlal2 v20.4S, v28.8H, v22.8H // ................................*.......................................... + // ldr q9, [x2, #-16] // ...........................................*............................... 
+ // smlal v23.4S, v28.4H, v22.4H // .................................*......................................... + // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........ + // uzp1 v17.8H, v8.8H, v9.8H // .............................................*............................. + // uzp2 v4.8H, v8.8H, v9.8H // ................................................*.......................... + // uzp1 v5.8H, v23.8H, v20.8H // ....................................*...................................... + // mul v31.8H, v5.8H, v2.8H // ......................................*.................................... + // ldr q19, [x4, #-16] // ............................................*.............................. + // ldr q29, [x1], #32 // ..............................................*............................ + // ldr q12, [x1, #-16] // ...............................................*........................... + // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............ + // uzp1 v13.8H, v18.8H, v19.8H // .................................................*......................... + // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................ + // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*....................... + // smull v12.4S, v3.4H, v4.4H // ....................................................*...................... + // smull2 v11.4S, v3.8H, v4.8H // .....................................................*..................... + // ldr q5, [x5, #-16] // ......................................................*.................... + // smlal v12.4S, v10.4H, v17.4H // .......................................................*................... + // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*.................. 
+ // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*................. + // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................ + // smlal v12.4S, v13.4H, v14.4H // ...........................................................*............... + // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*.............. + // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*............. + // smlal v23.4S, v31.4H, v0.4H // .................................................................*......... + // smlal v12.4S, v28.4H, v15.4H // ...............................................................*........... + // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*.......... + // ld1 {v22.8H}, [x6], #16 // .......................................................................*... + // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*..... + // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*...... + // smull v23.4S, v3.4H, v17.4H // ...................................................................*....... + // mul v14.8H, v1.8H, v2.8H // ......................................................................*.... + // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + // ld1 {v4.8H}, [x3], #16 // ..........................................................................* + // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*. 
+ + sub count, count, #2 +polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop: + // Instructions: 48 + // Expected cycles: 58 + // Expected IPC: 0.83 + + // Cycle bound: 58.0 + // IPC bound: 0.83 + + // Wall time: 6.39s + // User time: 6.39s + + // -------------- original position --------------> + // 0 25 + // |------------------------|---------------------- + smull2 v20.4S, v3.8H, v17.8H // ..........*..................................... + ldr q18, [x4], #32 // .................e.............................. + ldr q30, [x5], #32 // .....................e.......................... + smlal2 v20.4S, v10.8H, v4.8H // ............*................................... + smlal v12.4S, v14.4H, v0.4H // .........................................*...... + smlal v23.4S, v10.4H, v4.4H // ...........*.................................... + str q9, [x0, #16] // ...............................................l + smlal2 v20.4S, v13.8H, v15.8H // ...........................*.................... + ldr q8, [x2], #32 // ....e........................................... + smlal v23.4S, v13.4H, v15.4H // ..........................*..................... + smlal2 v20.4S, v28.8H, v22.8H // .............................*.................. + zip1 v26.8H, v19.8H, v27.8H // ............................................l... + ldr q9, [x2, #-16] // .....e.......................................... + smlal v23.4S, v28.4H, v22.4H // ............................*................... + uzp2 v27.8H, v12.8H, v11.8H // ...........................................*.... + uzp1 v17.8H, v8.8H, v9.8H // ......e......................................... + uzp2 v4.8H, v8.8H, v9.8H // .......e........................................ + uzp1 v5.8H, v23.8H, v20.8H // ..................................*............. + str q26, [x0], #32 // ..............................................l. + mul v31.8H, v5.8H, v2.8H // ...................................*............ 
+ ldr q19, [x4, #-16] // ..................e............................. + ldr q29, [x1], #32 // e............................................... + ldr q12, [x1, #-16] // .e.............................................. + smlal2 v20.4S, v31.8H, v0.8H // .....................................*.......... + uzp1 v13.8H, v18.8H, v19.8H // ...................e............................ + uzp1 v3.8H, v29.8H, v12.8H // ..e............................................. + uzp2 v10.8H, v29.8H, v12.8H // ...e............................................ + smull v12.4S, v3.4H, v4.4H // .............e.................................. + smull2 v11.4S, v3.8H, v4.8H // ..............e................................. + ldr q5, [x5, #-16] // ......................e......................... + smlal v12.4S, v10.4H, v17.4H // ...............e................................ + smlal2 v11.4S, v10.8H, v17.8H // ................e............................... + uzp2 v14.8H, v30.8H, v5.8H // ........................e....................... + uzp1 v15.8H, v30.8H, v5.8H // .......................e........................ + smlal v12.4S, v13.4H, v14.4H // ..............................e................. + smlal2 v11.4S, v13.8H, v14.8H // ...............................e................ + uzp2 v28.8H, v18.8H, v19.8H // ....................e........................... + smlal v23.4S, v31.4H, v0.4H // ....................................*........... + smlal v12.4S, v28.4H, v15.4H // ................................e............... + smlal2 v11.4S, v28.8H, v15.8H // .................................e.............. + ld1 {v22.8H}, [x6], #16 // .........................e...................... + uzp2 v19.8H, v23.8H, v20.8H // ......................................*......... + uzp1 v1.8H, v12.8H, v11.8H // .......................................e........ + smull v23.4S, v3.4H, v17.4H // .........e...................................... 
+ mul v14.8H, v1.8H, v2.8H // ........................................e....... + zip2 v9.8H, v19.8H, v27.8H // .............................................*.. + ld1 {v4.8H}, [x3], #16 // ........e....................................... + smlal2 v11.4S, v14.8H, v0.8H // ..........................................e..... + + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'.................. + // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'.................. + // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~.......... + // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~...... + // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~... + // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~.. + // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'.................. 
+ // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'.................. + // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~.................. + // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~............. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~............... + // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'.................. + // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'.................. + // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'.................. + // ldr q12, [x4], #32 // e..............................................'~..............................................'~................. + // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'.................. + // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................ 
+ // ldr q13, [x5, #-16] // ............................e..................'............................~..................'.................. + // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'.................. + // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'.................. + // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'.................. + // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~......... + // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~........... + // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~..... + // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........ + // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'.................. + // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'.................. + // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'.................. + // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~. 
+ // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'.................. + // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'.................. + // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'.................. + // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'.................. + // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'.................. + // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'.................. + // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~.............. + // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'.................. + // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~.... + // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l....... + // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'.................. + // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l + // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............ 
+ + sub count, count, #1 + cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop + // Instructions: 21 + // Expected cycles: 35 + // Expected IPC: 0.60 + + // Cycle bound: 35.0 + // IPC bound: 0.60 + + // Wall time: 0.08s + // User time: 0.08s + + // ----- original position -----> + // 0 25 + // |------------------------|---- + smull2 v5.4S, v3.8H, v17.8H // *............................. + smlal v12.4S, v14.4H, v0.4H // ..*........................... + smlal v23.4S, v10.4H, v4.4H // ...*.......................... + str q9, [x0, #16] // ....*......................... + smlal2 v5.4S, v10.8H, v4.8H // .*............................ + uzp2 v11.8H, v12.8H, v11.8H // ..........*................... + zip1 v9.8H, v19.8H, v27.8H // ........*..................... + smlal v23.4S, v13.4H, v15.4H // ......*....................... + smlal2 v5.4S, v13.8H, v15.8H // .....*........................ + str q9, [x0], #32 // ............*................. + smlal v23.4S, v28.4H, v22.4H // .........*.................... + smlal2 v5.4S, v28.8H, v22.8H // .......*...................... + uzp1 v9.8H, v23.8H, v5.8H // ...........*.................. + mul v9.8H, v9.8H, v2.8H // .............*................ + smlal2 v5.4S, v9.8H, v0.8H // ..............*............... + smlal v23.4S, v9.4H, v0.4H // ...............*.............. + uzp2 v9.8H, v23.8H, v5.8H // ................*............. + zip2 v5.8H, v9.8H, v11.8H // .................*............ + zip1 v9.8H, v9.8H, v11.8H // ...................*.......... + str q5, [x0, #16] // ..................*........... + str q9, [x0], #32 // ....................*......... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // smull2 v20.4S, v3.8H, v17.8H // *.............................. + // smlal2 v20.4S, v10.8H, v4.8H // ....*.......................... + // smlal v12.4S, v14.4H, v0.4H // .*............................. + // smlal v23.4S, v10.4H, v4.4H // ..*............................ 
+ // str q9, [x0, #16] // ...*........................... + // smlal2 v20.4S, v13.8H, v15.8H // ........*...................... + // smlal v23.4S, v13.4H, v15.4H // .......*....................... + // smlal2 v20.4S, v28.8H, v22.8H // ...........*................... + // zip1 v26.8H, v19.8H, v27.8H // ......*........................ + // smlal v23.4S, v28.4H, v22.4H // ..........*.................... + // uzp2 v27.8H, v12.8H, v11.8H // .....*......................... + // uzp1 v5.8H, v23.8H, v20.8H // ............*.................. + // str q26, [x0], #32 // .........*..................... + // mul v31.8H, v5.8H, v2.8H // .............*................. + // smlal2 v20.4S, v31.8H, v0.8H // ..............*................ + // smlal v23.4S, v31.4H, v0.4H // ...............*............... + // uzp2 v19.8H, v23.8H, v20.8H // ................*.............. + // zip2 v9.8H, v19.8H, v27.8H // .................*............. + // str q9, [x0, #16] // ...................*........... + // zip1 v26.8H, v19.8H, v27.8H // ..................*............ + // str q26, [x0], #32 // ....................*.......... 
+ + + pop_stack + ret + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq out + .unreq a0_ptr + .unreq b0_ptr + .unreq b0_cache_ptr + .unreq a1_ptr + .unreq b1_ptr + .unreq b1_cache_ptr + .unreq a2_ptr + .unreq b2_ptr + .unreq b2_cache_ptr + .unreq a3_ptr + .unreq b3_ptr + .unreq b3_cache_ptr + .unreq count + .unreq modulus + .unreq modulus_twisted + .unreq wtmp + .unreq aa0 + .unreq aa1 + .unreq bb0 + .unreq bb1 + .unreq bb1t + .unreq res0l + .unreq res1l + .unreq res0h + .unreq res1h + .unreq tmp0 + .unreq tmp1 + .unreq q_tmp0 + .unreq q_tmp1 + .unreq out0 + .unreq out1 + .unreq t0 + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S new file mode 100644 index 000000000..1c30ed6aa --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S @@ -0,0 +1,650 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// AArch64 re-implementation of the asymmetric base multiplication from: + +// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 +// https://eprint.iacr.org/2021/986 +// https://github.com/neon-ntt/neon-ntt + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 3 +/* simpasm: header-end */ + +// Input: +// - Vectors al, ah of 32-bit entries (clobbered) +// Output: +// - Montgomery reductions of al || ah, stored in x +.macro montgomery_reduce_long x, a + uzp1 t0.8h, \a\()l.8h, \a\()h.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, t0.8h, modulus.8h + uzp2 \x\().8h, \a\()l.8h, \a\()h.8h +.endm + +// Computes 
products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit. + +// Bounds: +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 +.macro pmull d, a, b + smull \d\()0l.4s, \a\()0.4h, \b\()0.4h + smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smull \d\()1l.4s, \a\()0.4h, \b\()1.4h + smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro pmlal d, a, b + smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h + smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h + smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + +.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + + out .req x0 + a0_ptr .req x1 + b0_ptr .req x2 + b0_cache_ptr .req x3 + a1_ptr .req x4 + b1_ptr .req x5 + b1_cache_ptr .req x6 + a2_ptr .req x7 + b2_ptr .req x8 + b2_cache_ptr .req x9 + a3_ptr .req x10 + b3_ptr .req x11 + b3_cache_ptr .req x12 + count 
.req x13 + wtmp .req w14 + + modulus .req v0 + modulus_twisted .req v2 + + aa0 .req v3 + aa1 .req v4 + bb0 .req v5 + bb1 .req v6 + bb1t .req v7 + + res0l .req v8 + res1l .req v9 + res0h .req v10 + res1h .req v11 + + tmp0 .req v12 + tmp1 .req v13 + q_tmp0 .req q12 + q_tmp1 .req q13 + + out0 .req v26 + out1 .req v27 + + t0 .req v28 + + .text + .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt): + push_stack + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp + + // Computed bases of vector entries + + add a1_ptr, a0_ptr, #(1 * 512) + add b1_ptr, b0_ptr, #(1 * 512) + add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) + add a2_ptr, a0_ptr, #(2 * 512) + add b2_ptr, b0_ptr, #(2 * 512) + add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) + + mov count, #(MLKEM_N / 16) + // Instructions: 75 + // Expected cycles: 103 + // Expected IPC: 0.73 + + // Cycle bound: 103.0 + // IPC bound: 0.73 + + // Wall time: 0.94s + // User time: 0.94s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q7, [x2, #16] // *.......................................................................... + ldr q20, [x2], #32 // ..*........................................................................ + ldr q15, [x1, #16] // .*......................................................................... + uzp1 v8.8H, v20.8H, v7.8H // ...............*........................................................... + uzp2 v7.8H, v20.8H, v7.8H // ................*.......................................................... + ld1 {v20.8H}, [x3], #16 // ...*....................................................................... + ldr q30, [x1], #32 // ..............*............................................................ 
+ ldr q11, [x4], #32 // ....*...................................................................... + uzp1 v16.8H, v30.8H, v15.8H // .................*......................................................... + uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................ + smull v30.4S, v16.4H, v7.4H // ...................*....................................................... + smull2 v7.4S, v16.8H, v7.8H // ....................*...................................................... + smull v9.4S, v16.4H, v8.4H // .....................*..................................................... + smull2 v16.4S, v16.8H, v8.8H // ......................*.................................................... + smlal v30.4S, v15.4H, v8.4H // .......................*................................................... + smlal2 v7.4S, v15.8H, v8.8H // ........................*.................................................. + smlal v9.4S, v15.4H, v20.4H // .........................*................................................. + smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................ + ldr q20, [x4, #-16] // .....*..................................................................... + ldr q15, [x5], #32 // ......*.................................................................... + uzp1 v8.8H, v11.8H, v20.8H // ...........................*............................................... + uzp2 v20.8H, v11.8H, v20.8H // ............................*.............................................. + ldr q11, [x5, #-16] // .......*................................................................... + ld1 {v27.8H}, [x6], #16 // ........*.................................................................. + uzp1 v10.8H, v15.8H, v11.8H // .............................*............................................. 
+ uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................ + smlal v9.4S, v8.4H, v10.4H // ...............................*........................................... + smlal2 v16.4S, v8.8H, v10.8H // ................................*.......................................... + smlal v30.4S, v8.4H, v15.4H // .................................*......................................... + smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................ + smlal v9.4S, v20.4H, v27.4H // ...................................*....................................... + smlal2 v16.4S, v20.8H, v27.8H // ....................................*...................................... + smlal v30.4S, v20.4H, v10.4H // .....................................*..................................... + smlal2 v7.4S, v20.8H, v10.8H // ......................................*.................................... + ldr q20, [x7], #32 // .........*................................................................. + ldr q15, [x7, #-16] // ..........*................................................................ + ldr q8, [x8], #32 // ...........*............................................................... + uzp1 v11.8H, v20.8H, v15.8H // .......................................*................................... + uzp2 v20.8H, v20.8H, v15.8H // ........................................*.................................. + ldr q15, [x8, #-16] // ............*.............................................................. + ld1 {v27.8H}, [x9], #16 // .............*............................................................. + uzp1 v10.8H, v8.8H, v15.8H // .........................................*................................. + uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................ 
+ smlal v9.4S, v11.4H, v10.4H // ...........................................*............................... + smlal2 v16.4S, v11.8H, v10.8H // ............................................*.............................. + smlal v30.4S, v11.4H, v15.4H // .............................................*............................. + smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................ + smlal v9.4S, v20.4H, v27.4H // ...............................................*........................... + smlal2 v16.4S, v20.8H, v27.8H // ................................................*.......................... + smlal v30.4S, v20.4H, v10.4H // .................................................*......................... + smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................ + ldr q15, [x2], #32 // ...............................................................*........... + uzp1 v20.8H, v9.8H, v16.8H // ....................................................*...................... + uzp1 v8.8H, v30.8H, v7.8H // .....................................................*..................... + mul v20.8H, v20.8H, v2.8H // ......................................................*.................... + mul v8.8H, v8.8H, v2.8H // .......................................................*................... + ldr q21, [x4], #32 // .................................................................*......... + smlal v9.4S, v20.4H, v0.4H // ........................................................*.................. + smlal2 v16.4S, v20.8H, v0.8H // .........................................................*................. + smlal v30.4S, v8.4H, v0.4H // ..........................................................*................ + smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*............... 
+ ldr q6, [x4, #-16] // ..................................................................*........ + uzp2 v27.8H, v9.8H, v16.8H // ............................................................*.............. + uzp2 v10.8H, v30.8H, v7.8H // .............................................................*............. + ldr q16, [x2, #-16] // ...................................................*....................... + ldr q30, [x1, #16] // ..............................................................*............ + ld1 {v9.8H}, [x3], #16 // ................................................................*.......... + ldr q1, [x5], #32 // ...................................................................*....... + ldr q12, [x5, #-16] // ....................................................................*...... + ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + ldr q19, [x7], #32 // ......................................................................*.... + ldr q31, [x7, #-16] // .......................................................................*... + ldr q17, [x8], #32 // ........................................................................*.. + ldr q18, [x8, #-16] // .........................................................................*. + ld1 {v25.8H}, [x9], #16 // ..........................................................................* + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q16, [x2, #16] // *.......................................................................... + // ldr q30, [x1, #16] // ..*........................................................................ + // ldr q15, [x2], #32 // .*......................................................................... 
+ // ld1 {v9.8H}, [x3], #16 // .....*..................................................................... + // ldr q21, [x4], #32 // .......*................................................................... + // ldr q6, [x4, #-16] // ..................*........................................................ + // ldr q1, [x5], #32 // ...................*....................................................... + // ldr q12, [x5, #-16] // ......................*.................................................... + // ld1 {v24.8H}, [x6], #16 // .......................*................................................... + // ldr q19, [x7], #32 // ..................................*........................................ + // ldr q31, [x7, #-16] // ...................................*....................................... + // ldr q17, [x8], #32 // ....................................*...................................... + // ldr q18, [x8, #-16] // .......................................*................................... + // ld1 {v25.8H}, [x9], #16 // ........................................*.................................. + // ldr q20, [x1], #32 // ......*.................................................................... + // uzp1 v7.8H, v15.8H, v16.8H // ...*....................................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ....*...................................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ........*.................................................................. + // uzp2 v20.8H, v20.8H, v30.8H // .........*................................................................. + // smull v30.4S, v8.4H, v15.4H // ..........*................................................................ + // smull2 v15.4S, v8.8H, v15.8H // ...........*............................................................... 
+ // smull v11.4S, v8.4H, v7.4H // ............*.............................................................. + // smull2 v8.4S, v8.8H, v7.8H // .............*............................................................. + // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................ + // smlal2 v15.4S, v20.8H, v7.8H // ...............*........................................................... + // smlal v11.4S, v20.4H, v9.4H // ................*.......................................................... + // smlal2 v8.4S, v20.8H, v9.8H // .................*......................................................... + // uzp1 v7.8H, v21.8H, v6.8H // ....................*...................................................... + // uzp2 v20.8H, v21.8H, v6.8H // .....................*..................................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........................*.................................................. + // uzp2 v9.8H, v1.8H, v12.8H // .........................*................................................. + // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................ + // smlal2 v8.4S, v7.8H, v16.8H // ...........................*............................................... + // smlal v30.4S, v7.4H, v9.4H // ............................*.............................................. + // smlal2 v15.4S, v7.8H, v9.8H // .............................*............................................. + // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................ + // smlal2 v8.4S, v20.8H, v24.8H // ...............................*........................................... + // smlal v30.4S, v20.4H, v16.4H // ................................*.......................................... 
+ // smlal2 v15.4S, v20.8H, v16.8H // .................................*......................................... + // uzp1 v7.8H, v19.8H, v31.8H // .....................................*..................................... + // uzp2 v20.8H, v19.8H, v31.8H // ......................................*.................................... + // uzp1 v16.8H, v17.8H, v18.8H // .........................................*................................. + // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................ + // smlal v11.4S, v7.4H, v16.4H // ...........................................*............................... + // smlal2 v8.4S, v7.8H, v16.8H // ............................................*.............................. + // smlal v30.4S, v7.4H, v9.4H // .............................................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................ + // smlal v11.4S, v20.4H, v25.4H // ...............................................*........................... + // smlal2 v8.4S, v20.8H, v25.8H // ................................................*.......................... + // smlal v30.4S, v20.4H, v16.4H // .................................................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................ + // ldr q16, [x2, #16] // ................................................................*.......... + // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*...................... + // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*..................... + // mul v7.8H, v7.8H, v2.8H // ......................................................*.................... 
+ // mul v20.8H, v20.8H, v2.8H // .......................................................*................... + // smlal v11.4S, v7.4H, v0.4H // .........................................................*................. + // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................ + // smlal v30.4S, v20.4H, v0.4H // ...........................................................*............... + // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*.............. + // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............ + // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*........... + // ldr q30, [x1, #16] // .................................................................*......... + // ldr q15, [x2], #32 // ...................................................*....................... + // ld1 {v9.8H}, [x3], #16 // ..................................................................*........ + // ldr q21, [x4], #32 // ........................................................*.................. + // ldr q6, [x4, #-16] // .............................................................*............. + // ldr q1, [x5], #32 // ...................................................................*....... + // ldr q12, [x5, #-16] // ....................................................................*...... + // ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + // ldr q19, [x7], #32 // ......................................................................*.... + // ldr q31, [x7, #-16] // .......................................................................*... + // ldr q17, [x8], #32 // ........................................................................*.. 
+ // ldr q18, [x8, #-16] // .........................................................................*. + // ld1 {v25.8H}, [x9], #16 // ..........................................................................* + + sub count, count, #2 +polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop: + // Instructions: 65 + // Expected cycles: 80 + // Expected IPC: 0.81 + + // Cycle bound: 80.0 + // IPC bound: 0.81 + + // Wall time: 11.64s + // User time: 11.64s + + // ---------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- + ldr q20, [x1], #32 // *................................................................ + uzp1 v7.8H, v15.8H, v16.8H // ......*.......................................................... + uzp2 v15.8H, v15.8H, v16.8H // .......*......................................................... + uzp1 v8.8H, v20.8H, v30.8H // ..*.............................................................. + uzp2 v20.8H, v20.8H, v30.8H // ...*............................................................. + smull v30.4S, v8.4H, v15.4H // .............*................................................... + smull2 v15.4S, v8.8H, v15.8H // ..............*.................................................. + smull v11.4S, v8.4H, v7.4H // .........*....................................................... + smull2 v8.4S, v8.8H, v7.8H // ..........*...................................................... + smlal v30.4S, v20.4H, v7.4H // ...............*................................................. + smlal2 v15.4S, v20.8H, v7.8H // ................*................................................ + smlal v11.4S, v20.4H, v9.4H // ...........*..................................................... + smlal2 v8.4S, v20.8H, v9.8H // ............*.................................................... + uzp1 v7.8H, v21.8H, v6.8H // ...................*............................................. 
+ uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................ + uzp1 v16.8H, v1.8H, v12.8H // .......................*......................................... + uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................ + smlal v11.4S, v7.4H, v16.4H // ..........................*...................................... + smlal2 v8.4S, v7.8H, v16.8H // ...........................*..................................... + smlal v30.4S, v7.4H, v9.4H // ..............................*.................................. + smlal2 v15.4S, v7.8H, v9.8H // ...............................*................................. + smlal v11.4S, v20.4H, v24.4H // ............................*.................................... + smlal2 v8.4S, v20.8H, v24.8H // .............................*................................... + smlal v30.4S, v20.4H, v16.4H // ................................*................................ + smlal2 v15.4S, v20.8H, v16.8H // .................................*............................... + uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................ + uzp2 v20.8H, v19.8H, v31.8H // .....................................*........................... + uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................ + uzp2 v9.8H, v17.8H, v18.8H // .........................................*....................... + smlal v11.4S, v7.4H, v16.4H // ...........................................*..................... + smlal2 v8.4S, v7.8H, v16.8H // ............................................*.................... + smlal v30.4S, v7.4H, v9.4H // ...............................................*................. + smlal2 v15.4S, v7.8H, v9.8H // ................................................*................ + smlal v11.4S, v20.4H, v25.4H // .............................................*................... 
+ smlal2 v8.4S, v20.8H, v25.8H // ..............................................*.................. + smlal v30.4S, v20.4H, v16.4H // .................................................*............... + smlal2 v15.4S, v20.8H, v16.8H // ..................................................*.............. + ldr q16, [x2, #16] // .....e........................................................... + uzp1 v7.8H, v11.8H, v8.8H // ...................................................*............. + uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........ + mul v7.8H, v7.8H, v2.8H // ....................................................*............ + mul v20.8H, v20.8H, v2.8H // .........................................................*....... + zip2 v9.8H, v27.8H, v10.8H // ..............................................................l.. + zip1 v27.8H, v27.8H, v10.8H // .............................................................l... + smlal v11.4S, v7.4H, v0.4H // .....................................................*........... + smlal2 v8.4S, v7.8H, v0.8H // ......................................................*.......... + smlal v30.4S, v20.4H, v0.4H // ..........................................................*...... + smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*..... + str q27, [x0], #32 // ...............................................................l. + uzp2 v27.8H, v11.8H, v8.8H // .......................................................*......... + str q9, [x0, #-16] // ................................................................l + uzp2 v10.8H, v30.8H, v15.8H // ............................................................*.... + ldr q30, [x1, #16] // .e............................................................... + ldr q15, [x2], #32 // ....e............................................................ 
+ ld1 {v9.8H}, [x3], #16 // ........e........................................................ + ldr q21, [x4], #32 // .................e............................................... + ldr q6, [x4, #-16] // ..................e.............................................. + ldr q1, [x5], #32 // .....................e........................................... + ldr q12, [x5, #-16] // ......................e.......................................... + ld1 {v24.8H}, [x6], #16 // .........................e....................................... + ldr q19, [x7], #32 // ..................................e.............................. + ldr q31, [x7, #-16] // ...................................e............................. + ldr q17, [x8], #32 // ......................................e.......................... + ldr q18, [x8, #-16] // .......................................e......................... + ld1 {v25.8H}, [x9], #16 // ..........................................e...................... + + // ---------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q12, [x1], #32 // ............................*................................................................~.................................................. + // ldr q13, [x1, #-16] // ...............e............'...................................................~............'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~............................................... 
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~.............................................. + // ldr q12, [x2], #32 // ................e...........'....................................................~...........'.................................................. + // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~............. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~................................................. + // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................ + // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'.................................................. + // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~........................................... + // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~.......................................... + // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~....................................... + // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~...................................... + // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~............................................. 
+ // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................ + // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~......................................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................ + // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'.................................................. + // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~..................................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~.................................... + // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'.................................................. + // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'.................................................. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~................................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~.................................. 
+ // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................ + // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~............................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................ + // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~............................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~.............................. + // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~........................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~.......................... + // ldr q12, [x7], #32 // .......................e....'...........................................................~....'.................................................. + // ldr q13, [x7, #-16] // ........................e...'............................................................~...'.................................................. 
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~......................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................ + // ldr q12, [x8], #32 // .........................e..'.............................................................~..'.................................................. + // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'.................................................. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~....................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~...................... + // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~..................... + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~.................... + // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................ 
+ // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~.................. + // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~............... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~.............. + // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............ + // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~.......... + // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~...... + // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~..... + // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~. + // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~........... + // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~......... 
+ // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~.... + // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~... + // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'.................................................. + // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l....... + // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........ + // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l.. + // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l + + sub count, count, #1 + cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop + // Instructions: 55 + // Expected cycles: 61 + // Expected IPC: 0.90 + + // Cycle bound: 61.0 + // IPC bound: 0.90 + + // Wall time: 8.41s + // User time: 8.41s + + // ----------------- original position ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + ldr q7, [x1], #32 // *...................................................... + uzp1 v20.8H, v15.8H, v16.8H // .*..................................................... + uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + uzp1 v23.8H, v7.8H, v30.8H // ...*................................................... 
+ uzp2 v11.8H, v7.8H, v30.8H // ....*.................................................. + smull2 v8.4S, v23.8H, v20.8H // ........*.............................................. + smull v5.4S, v23.4H, v20.4H // .......*............................................... + smull2 v30.4S, v23.8H, v15.8H // ......*................................................ + uzp1 v28.8H, v1.8H, v12.8H // ...............*....................................... + smlal2 v8.4S, v11.8H, v9.8H // ............*.......................................... + smlal v5.4S, v11.4H, v9.4H // ...........*........................................... + uzp1 v3.8H, v21.8H, v6.8H // .............*......................................... + smull v16.4S, v23.4H, v15.4H // .....*................................................. + smlal2 v8.4S, v3.8H, v28.8H // ..................*.................................... + smlal v5.4S, v3.4H, v28.4H // .................*..................................... + uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................ + uzp1 v7.8H, v17.8H, v18.8H // ...........................*........................... + smlal2 v8.4S, v29.8H, v24.8H // ......................*................................ + uzp1 v14.8H, v19.8H, v31.8H // .........................*............................. + smlal v16.4S, v11.4H, v20.4H // .........*............................................. + smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................ + smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................ + uzp2 v20.8H, v1.8H, v12.8H // ................*...................................... + uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................ + smlal2 v30.4S, v3.8H, v20.8H // ....................*.................................. + smlal v16.4S, v3.4H, v20.4H // ...................*................................... 
+ smlal v5.4S, v29.4H, v24.4H // .....................*................................. + uzp2 v9.8H, v17.8H, v18.8H // ............................*.......................... + smlal2 v30.4S, v29.8H, v28.8H // ........................*.............................. + smlal v16.4S, v29.4H, v28.4H // .......................*............................... + smlal v5.4S, v14.4H, v7.4H // .............................*......................... + smlal2 v8.4S, v21.8H, v25.8H // ..................................*.................... + smlal2 v30.4S, v14.8H, v9.8H // ................................*...................... + smlal v16.4S, v14.4H, v9.4H // ...............................*....................... + smlal v5.4S, v21.4H, v25.4H // .................................*..................... + zip1 v20.8H, v27.8H, v10.8H // ..........................................*............ + smlal2 v30.4S, v21.8H, v7.8H // ....................................*.................. + smlal v16.4S, v21.4H, v7.4H // ...................................*................... + uzp1 v7.8H, v5.8H, v8.8H // .....................................*................. + str q20, [x0], #32 // ...............................................*....... + mul v15.8H, v7.8H, v2.8H // .......................................*............... + uzp1 v7.8H, v16.8H, v30.8H // ......................................*................ + zip2 v31.8H, v27.8H, v10.8H // .........................................*............. + mul v20.8H, v7.8H, v2.8H // ........................................*.............. + smlal v5.4S, v15.4H, v0.4H // ...........................................*........... + smlal2 v8.4S, v15.8H, v0.8H // ............................................*.......... + str q31, [x0, #-16] // .................................................*..... + smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........ 
+ smlal v16.4S, v20.4H, v0.4H // .............................................*......... + uzp2 v15.8H, v5.8H, v8.8H // ................................................*...... + uzp2 v20.8H, v16.8H, v30.8H // ..................................................*.... + zip1 v7.8H, v15.8H, v20.8H // ....................................................*.. + zip2 v20.8H, v15.8H, v20.8H // ...................................................*... + str q7, [x0], #32 // .....................................................*. + str q20, [x0, #-16] // ......................................................* + + // -------------------- new position --------------------> + // 0 25 50 + // |------------------------|------------------------|---- + // ldr q20, [x1], #32 // *...................................................... + // uzp1 v7.8H, v15.8H, v16.8H // .*..................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ...*................................................... + // uzp2 v20.8H, v20.8H, v30.8H // ....*.................................................. + // smull v30.4S, v8.4H, v15.4H // ............*.......................................... + // smull2 v15.4S, v8.8H, v15.8H // .......*............................................... + // smull v11.4S, v8.4H, v7.4H // ......*................................................ + // smull2 v8.4S, v8.8H, v7.8H // .....*................................................. + // smlal v30.4S, v20.4H, v7.4H // ...................*................................... + // smlal2 v15.4S, v20.8H, v7.8H // ....................*.................................. + // smlal v11.4S, v20.4H, v9.4H // ..........*............................................ + // smlal2 v8.4S, v20.8H, v9.8H // .........*............................................. 
+ // uzp1 v7.8H, v21.8H, v6.8H // ...........*........................................... + // uzp2 v20.8H, v21.8H, v6.8H // ...............*....................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........*.............................................. + // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................ + // smlal v11.4S, v7.4H, v16.4H // ..............*........................................ + // smlal2 v8.4S, v7.8H, v16.8H // .............*......................................... + // smlal v30.4S, v7.4H, v9.4H // .........................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ........................*.............................. + // smlal v11.4S, v20.4H, v24.4H // ..........................*............................ + // smlal2 v8.4S, v20.8H, v24.8H // .................*..................................... + // smlal v30.4S, v20.4H, v16.4H // .............................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ............................*.......................... + // uzp1 v7.8H, v19.8H, v31.8H // ..................*.................................... + // uzp2 v20.8H, v19.8H, v31.8H // .......................*............................... + // uzp1 v16.8H, v17.8H, v18.8H // ................*...................................... + // uzp2 v9.8H, v17.8H, v18.8H // ...........................*........................... + // smlal v11.4S, v7.4H, v16.4H // ..............................*........................ + // smlal2 v8.4S, v7.8H, v16.8H // .....................*................................. + // smlal v30.4S, v7.4H, v9.4H // .................................*..................... + // smlal2 v15.4S, v7.8H, v9.8H // ................................*...................... + // smlal v11.4S, v20.4H, v25.4H // ..................................*.................... 
+ // smlal2 v8.4S, v20.8H, v25.8H // ...............................*....................... + // smlal v30.4S, v20.4H, v16.4H // .....................................*................. + // smlal2 v15.4S, v20.8H, v16.8H // ....................................*.................. + // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................ + // uzp1 v20.8H, v30.8H, v15.8H // .........................................*............. + // mul v7.8H, v7.8H, v2.8H // ........................................*.............. + // mul v20.8H, v20.8H, v2.8H // ...........................................*........... + // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............ + // zip1 v27.8H, v27.8H, v10.8H // ...................................*................... + // smlal v11.4S, v7.4H, v0.4H // ............................................*.......... + // smlal2 v8.4S, v7.8H, v0.8H // .............................................*......... + // smlal v30.4S, v20.4H, v0.4H // ................................................*...... + // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*....... + // str q27, [x0], #32 // .......................................*............... + // uzp2 v27.8H, v11.8H, v8.8H // .................................................*..... + // str q9, [x0, #-16] // ..............................................*........ + // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*.... + // zip2 v9.8H, v27.8H, v10.8H // ....................................................*.. + // zip1 v27.8H, v27.8H, v10.8H // ...................................................*... + // str q27, [x0], #32 // .....................................................*. 
+ // str q9, [x0, #-16] // ......................................................* + + + pop_stack + ret + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq out + .unreq a0_ptr + .unreq b0_ptr + .unreq b0_cache_ptr + .unreq a1_ptr + .unreq b1_ptr + .unreq b1_cache_ptr + .unreq a2_ptr + .unreq b2_ptr + .unreq b2_cache_ptr + .unreq a3_ptr + .unreq b3_ptr + .unreq b3_cache_ptr + .unreq count + .unreq modulus + .unreq modulus_twisted + .unreq wtmp + .unreq aa0 + .unreq aa1 + .unreq bb0 + .unreq bb1 + .unreq bb1t + .unreq res0l + .unreq res1l + .unreq res0h + .unreq res1h + .unreq tmp0 + .unreq tmp1 + .unreq q_tmp0 + .unreq q_tmp1 + .unreq out0 + .unreq out1 + .unreq t0 + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S new file mode 100644 index 000000000..c3d70ed42 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// AArch64 re-implementation of the asymmetric base multiplication from: + +// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 +// https://eprint.iacr.org/2021/986 +// https://github.com/neon-ntt/neon-ntt + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 4 +/* simpasm: header-end */ + +// Input: +// - Vectors al, ah of 32-bit entries +// Output: +// - Montgomery reductions of al || ah, stored in x (al and ah are clobbered) +.macro montgomery_reduce_long x, a + uzp1 t0.8h, \a\()l.8h, \a\()h.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s,
+ t0.8h, modulus.8h + uzp2 \x\().8h, \a\()l.8h, \a\()h.8h +.endm + +// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit. + +// Bounds: +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 +.macro pmull d, a, b + smull \d\()0l.4s, \a\()0.4h, \b\()0.4h + smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smull \d\()1l.4s, \a\()0.4h, \b\()1.4h + smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro pmlal d, a, b + smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h + smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h + smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + +.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + + out .req x0 + a0_ptr .req x1 + b0_ptr .req x2 + b0_cache_ptr .req x3 + a1_ptr .req x4 + b1_ptr .req x5 + b1_cache_ptr .req x6 + a2_ptr .req x7 + b2_ptr .req x8 + b2_cache_ptr
.req x9 + a3_ptr .req x10 + b3_ptr .req x11 + b3_cache_ptr .req x12 + count .req x13 + wtmp .req w14 + + modulus .req v0 + modulus_twisted .req v2 + + aa0 .req v3 + aa1 .req v4 + bb0 .req v5 + bb1 .req v6 + bb1t .req v7 + + res0l .req v8 + res1l .req v9 + res0h .req v10 + res1h .req v11 + + tmp0 .req v12 + tmp1 .req v13 + q_tmp0 .req q12 + q_tmp1 .req q13 + + out0 .req v26 + out1 .req v27 + + t0 .req v28 + + .text + .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt): + push_stack + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp + + // Computed bases of vector entries + + add a1_ptr, a0_ptr, #(1 * 512) + add b1_ptr, b0_ptr, #(1 * 512) + add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) + add a2_ptr, a0_ptr, #(2 * 512) + add b2_ptr, b0_ptr, #(2 * 512) + add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) + add a3_ptr, a0_ptr, #(3 * 512) + add b3_ptr, b0_ptr, #(3 * 512) + add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) + + // Bounds: + + // Each pmull is bound by 2*4096*2^15=2^28, so the final value + // before Montgomery reduction is bound by 2^30. + + mov count, #(MLKEM_N / 16) + // Instructions: 114 + // Expected cycles: 153 + // Expected IPC: 0.75 + // + // Cycle bound: 153.0 + // IPC bound: 0.75 + // + // Wall time: 0.69s + // User time: 0.69s + // + // ----------------------------------------------- original position -----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + ldr q23, [x2, #16] // .*................................................................................................................ + ldr q19, [x2], #32 // *................................................................................................................. 
+ ldr q17, [x5], #32 // ..*............................................................................................................... + uzp2 v13.8H, v19.8H, v23.8H // ..........*....................................................................................................... + uzp1 v19.8H, v19.8H, v23.8H // ...........*...................................................................................................... + ldr q23, [x5, #-16] // ...*.............................................................................................................. + ldr q30, [x1, #16] // .....*............................................................................................................ + uzp2 v9.8H, v17.8H, v23.8H // ....*............................................................................................................. + uzp1 v23.8H, v17.8H, v23.8H // .......*.......................................................................................................... + ldr q17, [x1], #32 // ......*........................................................................................................... + ldr q10, [x7, #16] // .............*.................................................................................................... + uzp1 v12.8H, v17.8H, v30.8H // ........*......................................................................................................... + uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................ + smull2 v30.4S, v12.8H, v13.8H // ............*..................................................................................................... + smull v13.4S, v12.4H, v13.4H // ............................................*..................................................................... 
+ smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................ + smull v12.4S, v12.4H, v19.4H // ..........................................*....................................................................... + smlal2 v30.4S, v17.8H, v19.8H // ...............................*.................................................................................. + smlal v13.4S, v17.4H, v19.4H // ...............................................*.................................................................. + ldr q19, [x4], #32 // ....................*............................................................................................. + ldr q16, [x4, #-16] // .....................*............................................................................................ + ld1 {v8.8H}, [x3], #16 // ................................*................................................................................. + uzp1 v26.8H, v19.8H, v16.8H // .......................*.......................................................................................... + uzp2 v19.8H, v19.8H, v16.8H // ........................*......................................................................................... + smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................ + smlal v13.4S, v26.4H, v9.4H // ..................................................*............................................................... + smlal2 v22.4S, v17.8H, v8.8H // ........................................*......................................................................... + smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................ 
+ smlal2 v30.4S, v19.8H, v23.8H // ...................................*.............................................................................. + smlal v13.4S, v19.4H, v23.4H // .......................................................*.......................................................... + smlal2 v22.4S, v26.8H, v23.8H // ...........................................*...................................................................... + smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................ + ldr q23, [x7], #32 // ......................*........................................................................................... + ldr q17, [x8, #16] // ..............*................................................................................................... + uzp1 v9.8H, v23.8H, v10.8H // ..........................*....................................................................................... + uzp2 v23.8H, v23.8H, v10.8H // ....................................*............................................................................. + ldr q10, [x10], #32 // ...............*.................................................................................................. + ldr q16, [x10, #-16] // ................*................................................................................................. + ld1 {v8.8H}, [x12], #16 // .................*................................................................................................ + uzp1 v26.8H, v10.8H, v16.8H // ..................*............................................................................................... + uzp2 v10.8H, v10.8H, v16.8H // ...................*.............................................................................................. 
+ ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................ + ldr q3, [x11, #16] // ...........................*...................................................................................... + smlal2 v22.4S, v19.8H, v16.8H // ..............................................*................................................................... + smlal v12.4S, v19.4H, v16.4H // ........................................................*......................................................... + ldr q19, [x11], #32 // ............................*..................................................................................... + ld1 {v16.8H}, [x9], #16 // .............................*.................................................................................... + uzp1 v4.8H, v19.8H, v3.8H // ..................................*............................................................................... + uzp2 v19.8H, v19.8H, v3.8H // .......................................*.......................................................................... + ldr q3, [x8], #32 // ..............................*................................................................................... + ldr q31, [x2], #32 // ......................................*........................................................................... + uzp1 v6.8H, v3.8H, v17.8H // ...................................................*.............................................................. + uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................ + smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*....................................................... 
+ smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*...................................................... + smlal v13.4S, v9.4H, v17.4H // ............................................................*..................................................... + smlal v12.4S, v9.4H, v6.4H // .............................................................*.................................................... + smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*................................................... + smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*.................................................. + smlal v13.4S, v23.4H, v6.4H // ................................................................*................................................. + smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................ + smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*............................................... + smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*.............................................. + smlal v13.4S, v26.4H, v19.4H // ....................................................................*............................................. + smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................ + smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*........................................... + smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*.......................................... 
+ smlal v13.4S, v10.4H, v4.4H // ........................................................................*......................................... + smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................ + ldr q19, [x2, #-16] // .........................................*........................................................................ + uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*...................................... + uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*............................. + mul v23.8H, v23.8H, v2.8H // .............................................................................*.................................... + uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*................................. + uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*.............................. + mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................ + smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................ + smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*............................... + ldr q23, [x5], #32 // .............................................*.................................................................... + smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*...... 
+ uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*........................... + smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*..... + ldr q17, [x5, #-16] // ................................................*................................................................. + ldr q13, [x1, #16] // ......................................................*........................................................... + uzp2 v27.8H, v23.8H, v17.8H // ....................................................*............................................................. + uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*..................................... + uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*.. + ldr q23, [x1], #32 // ..........................................................................*....................................... + zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................* + ldr q3, [x7, #16] // ........................................................................................*......................... + uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*................................... + uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*.................................. + smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*.......................... 
+ ldr q6, [x8, #16] // .........................................................................................*........................ + ldr q23, [x10], #32 // ..........................................................................................*....................... + smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*....... + ldr q17, [x10, #-16] // ...........................................................................................*...................... + ld1 {v22.8H}, [x12], #16 // ............................................................................................*..................... + uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*.................... + uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*................... + ldr q23, [x4], #32 // ...............................................................................................*.................. + ldr q17, [x4, #-16] // ................................................................................................*................. + ldr q4, [x7], #32 // .................................................................................................*................ + uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*............... + uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*.............. + uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............ 
+ smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*... + ld1 {v8.8H}, [x6], #16 // ....................................................................................................*............. + ldr q25, [x11, #16] // ......................................................................................................*........... + ldr q29, [x11], #32 // .......................................................................................................*.......... + ld1 {v12.8H}, [x9], #16 // ........................................................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*. + ldr q14, [x8], #32 // .........................................................................................................*........ + ld1 {v23.8H}, [x3], #16 // .............................................................................................................*.... + + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q3, [x2], #32 // .*................................................................................................................ + // ldr q17, [x2, #-16] // *................................................................................................................. + // ldr q21, [x5], #32 // ..*............................................................................................................... + // ldr q19, [x5, #-16] // .....*............................................................................................................ 
+ // uzp2 v27.8H, v21.8H, v19.8H // .......*.......................................................................................................... + // ldr q25, [x1, #16] // ......*........................................................................................................... + // ldr q22, [x1], #32 // .........*........................................................................................................ + // uzp1 v28.8H, v21.8H, v19.8H // ........*......................................................................................................... + // uzp1 v31.8H, v22.8H, v25.8H // ...........*...................................................................................................... + // uzp2 v16.8H, v22.8H, v25.8H // ............*..................................................................................................... + // uzp2 v21.8H, v3.8H, v17.8H // ...*.............................................................................................................. + // uzp1 v19.8H, v3.8H, v17.8H // ....*............................................................................................................. + // smull2 v24.4S, v31.8H, v21.8H // .............*.................................................................................................... + // ldr q3, [x7, #16] // ..........*....................................................................................................... + // ldr q6, [x8, #16] // .................................*................................................................................ + // ldr q8, [x10], #32 // ....................................*............................................................................. + // ldr q26, [x10, #-16] // .....................................*............................................................................ 
+ // ld1 {v22.8H}, [x12], #16 // ......................................*........................................................................... + // uzp1 v30.8H, v8.8H, v26.8H // .......................................*.......................................................................... + // uzp2 v11.8H, v8.8H, v26.8H // ........................................*......................................................................... + // ldr q8, [x4], #32 // ...................*.............................................................................................. + // ldr q26, [x4, #-16] // ....................*............................................................................................. + // ldr q4, [x7], #32 // ................................*................................................................................. + // uzp1 v20.8H, v8.8H, v26.8H // ......................*........................................................................................... + // uzp2 v26.8H, v8.8H, v26.8H // .......................*.......................................................................................... + // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................ + // uzp1 v9.8H, v4.8H, v3.8H // ..................................*............................................................................... + // ldr q25, [x11, #16] // ..........................................*....................................................................... + // ldr q29, [x11], #32 // .............................................*.................................................................... + // ld1 {v12.8H}, [x9], #16 // ..............................................*................................................................... 
+ // ldr q14, [x8], #32 // .................................................*................................................................ + // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................ + // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................ + // smlal2 v24.4S, v20.8H, v27.8H // ........................*......................................................................................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*.................................................................. + // smlal2 v24.4S, v26.8H, v28.8H // ............................*..................................................................................... + // uzp2 v4.8H, v4.8H, v3.8H // ...................................*.............................................................................. + // smull2 v13.4S, v31.8H, v19.8H // ...............*.................................................................................................. + // ldr q3, [x2], #32 // ..................................................*............................................................... + // uzp2 v1.8H, v29.8H, v25.8H // ................................................*................................................................. + // smlal2 v13.4S, v16.8H, v23.8H // ..........................*....................................................................................... + // ldr q17, [x2, #-16] // .....................................................................*............................................ + // smull v18.4S, v31.4H, v19.4H // ................*................................................................................................. 
+ // smlal2 v13.4S, v20.8H, v28.8H // ..............................*................................................................................... + // smull v29.4S, v31.4H, v21.4H // ..............*................................................................................................... + // ldr q21, [x5], #32 // ..............................................................................*................................... + // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*...................................................................... + // smlal v29.4S, v16.4H, v19.4H // ..................*............................................................................................... + // ldr q19, [x5, #-16] // ..................................................................................*............................... + // smlal v18.4S, v16.4H, v23.4H // ...........................*...................................................................................... + // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................ + // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*.............................................................. + // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*............................. + // smlal v18.4S, v20.4H, v28.4H // ...............................*.................................................................................. + // ldr q25, [x1, #16] // ...................................................................................*.............................. + // smlal v29.4S, v26.4H, v28.4H // .............................*.................................................................................... 
+ // smlal v18.4S, v26.4H, v8.4H // ............................................*..................................................................... + // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*............................................................. + // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................ + // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*........................................................... + // smlal v29.4S, v9.4H, v26.4H // .......................................................*.......................................................... + // smlal v18.4S, v9.4H, v31.4H // ........................................................*......................................................... + // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................ + // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*....................................................... + // smlal v29.4S, v4.4H, v31.4H // ...........................................................*...................................................... + // smlal v18.4S, v4.4H, v12.4H // ............................................................*..................................................... + // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................................................... + // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*................................................... + // smlal v29.4S, v30.4H, v1.4H // ...............................................................*.................................................. 
+ // smlal v18.4S, v30.4H, v10.4H // ................................................................*................................................. + // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................ + // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*............................................... + // smlal v29.4S, v11.4H, v10.4H // ...................................................................*.............................................. + // smlal v18.4S, v11.4H, v22.4H // ....................................................................*............................................. + // ldr q22, [x1], #32 // .......................................................................................*.......................... + // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*........................................... + // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................ + // mul v19.8H, v31.8H, v2.8H // ........................................................................*......................................... + // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*....................... + // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*...................... + // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................ + // smlal v29.4S, v19.4H, v0.4H // ............................................................................*..................................... 
+ // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*.................................... + // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*....................................... + // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*.......................................... + // mul v23.8H, v26.8H, v2.8H // ...........................................................................*...................................... + // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*................................. + // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*..................... + // ldr q3, [x7, #16] // .........................................................................................*........................ + // ldr q6, [x8, #16] // .............................................................................................*.................... + // ldr q8, [x10], #32 // ..............................................................................................*................... + // ldr q26, [x10, #-16] // ................................................................................................*................. + // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................ + // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*............... + // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*.............. 
+ // ldr q8, [x4], #32 // ....................................................................................................*............. + // ldr q26, [x4, #-16] // .....................................................................................................*............ + // ldr q4, [x7], #32 // ......................................................................................................*........... + // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*.......... + // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*......... + // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*...... + // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........ + // ldr q25, [x11, #16] // ............................................................................................................*..... + // ldr q29, [x11], #32 // .............................................................................................................*.... + // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*... + // ldr q14, [x8], #32 // ................................................................................................................*. + // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*.................. + // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*.................................. 
+ // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................ + // ld1 {v23.8H}, [x3], #16 // .................................................................................................................* + // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*........................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*.. + // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*......................... + + sub count, count, #2 +polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop: + // Instructions: 82 + // Expected cycles: 102 + // Expected IPC: 0.80 + // + // Cycle bound: 102.0 + // IPC bound: 0.80 + // + // Wall time: 15.93s + // User time: 15.93s + // + // ------------------------------- original position -------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------ + smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................ + uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................ + smull2 v13.4S, v31.8H, v19.8H // ..........*....................................................................... + ldr q3, [x2], #32 // ....e............................................................................. + uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*....................... 
+ smlal2 v13.4S, v16.8H, v23.8H // ............*..................................................................... + ldr q17, [x2, #-16] // .....e............................................................................ + smull v18.4S, v31.4H, v19.4H // .........*........................................................................ + smlal2 v13.4S, v20.8H, v28.8H // ...........................*...................................................... + smull v29.4S, v31.4H, v21.4H // .............*.................................................................... + ldr q21, [x5], #32 // .....................e............................................................ + smlal2 v13.4S, v26.8H, v8.8H // .............................*.................................................... + smlal v29.4S, v16.4H, v19.4H // ...............*.................................................................. + ldr q19, [x5, #-16] // ......................e........................................................... + smlal v18.4S, v16.4H, v23.4H // ...........*...................................................................... + smlal v29.4S, v20.4H, v27.4H // ..............................*................................................... + uzp1 v31.8H, v14.8H, v6.8H // ........................................*......................................... + uzp2 v27.8H, v21.8H, v19.8H // ........................e......................................................... + smlal v18.4S, v20.4H, v28.4H // ..........................*....................................................... + ldr q25, [x1, #16] // .e................................................................................ + smlal v29.4S, v26.4H, v28.4H // ................................*................................................. + smlal v18.4S, v26.4H, v8.4H // ............................*..................................................... 
+ uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................ + smlal2 v13.4S, v9.8H, v31.8H // ............................................*..................................... + smlal2 v24.4S, v9.8H, v26.8H // ................................................*................................. + smlal v29.4S, v9.4H, v26.4H // ...............................................*.................................. + smlal v18.4S, v9.4H, v31.4H // ...........................................*...................................... + smlal2 v13.4S, v4.8H, v12.8H // ..............................................*................................... + smlal2 v24.4S, v4.8H, v31.8H // ..................................................*............................... + smlal v29.4S, v4.4H, v31.4H // .................................................*................................ + smlal v18.4S, v4.4H, v12.4H // .............................................*.................................... + smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................... + smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................ + smlal v29.4S, v30.4H, v1.4H // ................................................................*................. + smlal v18.4S, v30.4H, v10.4H // ............................................................*..................... + smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*.................. + smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*.............. + smlal v29.4S, v11.4H, v10.4H // ..................................................................*............... + smlal v18.4S, v11.4H, v22.4H // ..............................................................*................... 
+ ldr q22, [x1], #32 // e................................................................................. + uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........ + uzp1 v28.8H, v21.8H, v19.8H // .......................e.......................................................... + mul v19.8H, v31.8H, v2.8H // ..........................................................................*....... + uzp1 v31.8H, v22.8H, v25.8H // ..e............................................................................... + uzp2 v16.8H, v22.8H, v25.8H // ...e.............................................................................. + uzp2 v21.8H, v3.8H, v17.8H // .......e.......................................................................... + smlal v29.4S, v19.4H, v0.4H // ...........................................................................*...... + smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*..... + uzp1 v19.8H, v3.8H, v17.8H // ......e........................................................................... + uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*............. + zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l.. + mul v23.8H, v26.8H, v2.8H // .....................................................................*............ + uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*.... + smull2 v24.4S, v31.8H, v21.8H // ..............e................................................................... + str q14, [x0, #16] // .................................................................................l + ldr q3, [x7, #16] // ...................................e.............................................. 
+ ldr q6, [x8, #16] // .......................................e.......................................... + ldr q8, [x10], #32 // ...................................................e.............................. + ldr q26, [x10, #-16] // ....................................................e............................. + ld1 {v22.8H}, [x12], #16 // ...........................................................e...................... + uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................ + uzp2 v11.8H, v8.8H, v26.8H // ......................................................e........................... + ldr q8, [x4], #32 // .................e................................................................ + ldr q26, [x4, #-16] // ..................e............................................................... + ldr q4, [x7], #32 // ..................................e............................................... + uzp1 v20.8H, v8.8H, v26.8H // ...................e.............................................................. + uzp2 v26.8H, v8.8H, v26.8H // ....................e............................................................. + ld1 {v8.8H}, [x6], #16 // .........................e........................................................ + uzp1 v9.8H, v4.8H, v3.8H // ....................................e............................................. + ldr q25, [x11, #16] // ........................................................e......................... + ldr q29, [x11], #32 // .......................................................e.......................... + ld1 {v12.8H}, [x9], #16 // ..........................................e....................................... + ldr q14, [x8], #32 // ......................................e........................................... + smlal2 v24.4S, v16.8H, v19.8H // ................e................................................................. 
+ smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*.......... + smlal v18.4S, v23.4H, v0.4H // ......................................................................*........... + ld1 {v23.8H}, [x3], #16 // ........e......................................................................... + smlal2 v24.4S, v20.8H, v27.8H // ...............................e.................................................. + uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................ + str q5, [x0], #32 // ................................................................................l. + zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*... + + // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~......................................... + // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~............................................................. 
+ // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~..................................... + // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~.................................... + // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~............................................................................. + // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~.......................................................................... + // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................ + // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~................................... + // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~.... 
+ // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~......................................................................... + // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~.............................................................................. + // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~.................................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~........................................................................... + // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~....................................................................... + // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~........................... + // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~.................................................................... 
+ // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~....... + // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~.................. + // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~................. + // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~............... + // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~.............. + // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~...................................................................... + // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~................................................................... 
+ // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~....................................... + // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~............................................................... + // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~............. + // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~.............................................................. + // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................ + // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~........................................................... + // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~..................................................................... 
+ // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~................................................................. + // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~... + // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................ + // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................ + // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................ + // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~......................... + // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............ 
+ // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~............................................................................... + // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........ + // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................ + // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................ + // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~.......................................................... + // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~......... + // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~...................................................... 
+ // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~......................................................... + // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~.................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~..................................................... + // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~....................................................... + // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................ + // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~................................................... + // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~.................................................... 
+ // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~....................... + // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~...................... + // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~.................... + // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~................... + // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~.......... + // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~........... + // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~. 
+ // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................ + // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~..................... + // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~.............................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~................................................. + // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~.......................................... + // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~............................................. + // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~............................................... 
+ // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................ + // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~........................................... + // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................ + // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~............................... + // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~............................. + // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~..... + // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~...... 
+ // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~.. + // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................ + // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~...................................... + // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~.................................. + // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~................................. + // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................ + // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................ 
+ // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l.............................. + // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l + // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l.......................... + + sub count, count, #1 + cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop + + // Instructions: 50 + // Expected cycles: 56 + // Expected IPC: 0.89 + // + // Cycle bound: 56.0 + // IPC bound: 0.89 + // + // Wall time: 4.16s + // User time: 4.16s + // + // --------------- original position ---------------> + // 0 25 + // |------------------------| + smull2 v17.4S, v31.8H, v19.8H // ..*............................................... + uzp2 v1.8H, v14.8H, v6.8H // ................*................................. + smull v18.4S, v31.4H, v21.4H // .......*.......................................... + smlal2 v24.4S, v26.8H, v28.8H // *................................................. + smlal2 v17.4S, v16.8H, v23.8H // ....*............................................. + smull v21.4S, v31.4H, v19.4H // .....*............................................ + smlal v18.4S, v16.4H, v19.4H // .........*........................................ + uzp2 v31.8H, v4.8H, v3.8H // .*................................................ + uzp1 v3.8H, v14.8H, v6.8H // ............*..................................... 
+ smlal v21.4S, v16.4H, v23.4H // ..........*....................................... + smlal v18.4S, v20.4H, v27.4H // ...........*...................................... + uzp2 v14.8H, v29.8H, v25.8H // ...*.............................................. + smlal2 v17.4S, v20.8H, v28.8H // ......*........................................... + smlal v21.4S, v20.4H, v28.4H // .............*.................................... + smlal v18.4S, v26.4H, v28.4H // ..............*................................... + smlal2 v24.4S, v9.8H, v1.8H // ..................*............................... + smlal2 v17.4S, v26.8H, v8.8H // ........*......................................... + smlal v21.4S, v26.4H, v8.4H // ...............*.................................. + smlal v18.4S, v9.4H, v1.4H // ...................*.............................. + smlal2 v24.4S, v31.8H, v3.8H // ......................*........................... + smlal2 v17.4S, v9.8H, v3.8H // .................*................................ + smlal v21.4S, v9.4H, v3.4H // ....................*............................. + smlal v18.4S, v31.4H, v3.4H // .......................*.......................... + smlal2 v24.4S, v30.8H, v14.8H // ..........................*....................... + smlal2 v17.4S, v31.8H, v12.8H // .....................*............................ + smlal v21.4S, v31.4H, v12.4H // ........................*......................... + smlal v18.4S, v30.4H, v14.4H // ...........................*...................... + smlal2 v24.4S, v11.8H, v10.8H // ..............................*................... + smlal2 v17.4S, v30.8H, v10.8H // .........................*........................ + smlal v21.4S, v30.4H, v10.4H // ............................*..................... + smlal v18.4S, v11.4H, v10.4H // ...............................*.................. + zip2 v19.8H, v7.8H, v15.8H // ......................................*........... 
+ smlal2 v17.4S, v11.8H, v22.8H // .............................*.................... + smlal v21.4S, v11.4H, v22.4H // ................................*................. + uzp1 v23.8H, v18.8H, v24.8H // .................................*................ + str q19, [x0, #16] // .........................................*........ + mul v19.8H, v23.8H, v2.8H // ..................................*............... + uzp1 v23.8H, v21.8H, v17.8H // .....................................*............ + str q5, [x0], #32 // .............................................*.... + mul v26.8H, v23.8H, v2.8H // .......................................*.......... + smlal v18.4S, v19.4H, v0.4H // ...................................*.............. + smlal2 v24.4S, v19.8H, v0.8H // ....................................*............. + smlal v21.4S, v26.4H, v0.4H // ...........................................*...... + smlal2 v17.4S, v26.8H, v0.8H // ..........................................*....... + uzp2 v13.8H, v18.8H, v24.8H // ........................................*......... + uzp2 v19.8H, v21.8H, v17.8H // ............................................*..... + zip1 v23.8H, v19.8H, v13.8H // ..............................................*... + zip2 v19.8H, v19.8H, v13.8H // ...............................................*.. + str q23, [x0], #32 // .................................................* + str q19, [x0, #-16] // ................................................*. + + // ----------------- new position ------------------> + // 0 25 + // |------------------------|------------------------ + // smlal2 v24.4S, v26.8H, v28.8H // ...*.............................................. + // uzp2 v4.8H, v4.8H, v3.8H // .......*.......................................... + // smull2 v13.4S, v31.8H, v19.8H // *................................................. + // uzp2 v1.8H, v29.8H, v25.8H // ...........*...................................... 
+ // smlal2 v13.4S, v16.8H, v23.8H // ....*............................................. + // smull v18.4S, v31.4H, v19.4H // .....*............................................ + // smlal2 v13.4S, v20.8H, v28.8H // ............*..................................... + // smull v29.4S, v31.4H, v21.4H // ..*............................................... + // smlal2 v13.4S, v26.8H, v8.8H // ................*................................. + // smlal v29.4S, v16.4H, v19.4H // ......*........................................... + // smlal v18.4S, v16.4H, v23.4H // .........*........................................ + // smlal v29.4S, v20.4H, v27.4H // ..........*....................................... + // uzp1 v31.8H, v14.8H, v6.8H // ........*......................................... + // smlal v18.4S, v20.4H, v28.4H // .............*.................................... + // smlal v29.4S, v26.4H, v28.4H // ..............*................................... + // smlal v18.4S, v26.4H, v8.4H // .................*................................ + // uzp2 v26.8H, v14.8H, v6.8H // .*................................................ + // smlal2 v13.4S, v9.8H, v31.8H // ....................*............................. + // smlal2 v24.4S, v9.8H, v26.8H // ...............*.................................. + // smlal v29.4S, v9.4H, v26.4H // ..................*............................... + // smlal v18.4S, v9.4H, v31.4H // .....................*............................ + // smlal2 v13.4S, v4.8H, v12.8H // ........................*......................... + // smlal2 v24.4S, v4.8H, v31.8H // ...................*.............................. + // smlal v29.4S, v4.4H, v31.4H // ......................*........................... + // smlal v18.4S, v4.4H, v12.4H // .........................*........................ + // smlal2 v13.4S, v30.8H, v10.8H // ............................*..................... 
+ // smlal2 v24.4S, v30.8H, v1.8H // .......................*.......................... + // smlal v29.4S, v30.4H, v1.4H // ..........................*....................... + // smlal v18.4S, v30.4H, v10.4H // .............................*.................... + // smlal2 v13.4S, v11.8H, v22.8H // ................................*................. + // smlal2 v24.4S, v11.8H, v10.8H // ...........................*...................... + // smlal v29.4S, v11.4H, v10.4H // ..............................*................... + // smlal v18.4S, v11.4H, v22.4H // .................................*................ + // uzp1 v31.8H, v29.8H, v24.8H // ..................................*............... + // mul v19.8H, v31.8H, v2.8H // ....................................*............. + // smlal v29.4S, v19.4H, v0.4H // ........................................*......... + // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........ + // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............ + // zip2 v14.8H, v7.8H, v15.8H // ...............................*.................. + // mul v23.8H, v26.8H, v2.8H // .......................................*.......... + // uzp2 v15.8H, v29.8H, v24.8H // ............................................*..... + // str q14, [x0, #16] // ...................................*.............. + // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*...... + // smlal v18.4S, v23.4H, v0.4H // ..........................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // .............................................*.... + // str q5, [x0], #32 // ......................................*........... + // zip1 v5.8H, v7.8H, v15.8H // ..............................................*... + // zip2 v14.8H, v7.8H, v15.8H // ...............................................*.. 
+ // str q14, [x0, #16] // .................................................* + // str q5, [x0], #32 // ................................................*. + + + pop_stack + ret + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq out + .unreq a0_ptr + .unreq b0_ptr + .unreq b0_cache_ptr + .unreq a1_ptr + .unreq b1_ptr + .unreq b1_cache_ptr + .unreq a2_ptr + .unreq b2_ptr + .unreq b2_cache_ptr + .unreq a3_ptr + .unreq b3_ptr + .unreq b3_cache_ptr + .unreq count + .unreq modulus + .unreq modulus_twisted + .unreq wtmp + .unreq aa0 + .unreq aa1 + .unreq bb0 + .unreq bb1 + .unreq bb1t + .unreq res0l + .unreq res1l + .unreq res0h + .unreq res1h + .unreq tmp0 + .unreq tmp1 + .unreq q_tmp0 + .unreq q_tmp1 + .unreq out0 + .unreq out1 + .unreq t0 + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 4 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S deleted file mode 100644 index 94f0889b7..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_clean.S +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -// -// AArch64 re-implementation of the asymmetric base multiplication from: -// -// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 -// https://eprint.iacr.org/2021/986 -// https://github.com/neon-ntt/neon-ntt - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - -// Input: -// - Vectors al, ah of 32-bit entries -// Output: -// - Montgomery reductions of al || ah, stored in al -.macro montgomery_reduce_long x, a - uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, modulus_twisted.8h - smlal \a\()l.4s, t0.4h, modulus.4h - smlal2 \a\()h.4s, t0.8h, modulus.8h - uzp2 \x\().8h, \a\()l.8h, \a\()h.8h -.endm - 
-// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit. -// -// Bounds: -// - Assume |a| < 4096, -// - Result: < 2*4096*2^15 = 2^28 -.macro pmull d, a, b - smull \d\()0l.4s, \a\()0.4h, \b\()0.4h - smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smull \d\()1l.4s, \a\()0.4h, \b\()1.4h - smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro pmlal d, a, b - smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h - smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h - smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro ld2_wrap a, ptr - ldr q_tmp0, [\ptr\()], #32 - ldr q_tmp1, [\ptr\(), #-16] - uzp1 \a\()0.8h, tmp0.8h, tmp1.8h - uzp2 \a\()1.8h, tmp0.8h, tmp1.8h -.endm - -.macro st2_wrap a, ptr - zip1 tmp0.8h, \a\()0.8h, \a\()1.8h - zip2 tmp1.8h, \a\()0.8h, \a\()1.8h - str q_tmp0, [\ptr\()], #32 - str q_tmp1, [\ptr\(), #-16] -.endm - -.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2_wrap \a\(), \a_ptr - ld2_wrap \b\(), \b_ptr - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs -.endm - -.macro pop_stack - restore_vregs -.endm - - out .req x0 - a0_ptr .req x1 - b0_ptr .req x2 - b0_cache_ptr .req x3 - a1_ptr .req x4 - b1_ptr .req x5 - b1_cache_ptr .req x6 - a2_ptr .req x7 - b2_ptr .req x8 - b2_cache_ptr .req x9 - a3_ptr .req x10 - b3_ptr .req x11 - b3_cache_ptr 
.req x12 - count .req x13 - wtmp .req w14 - - modulus .req v0 - modulus_twisted .req v2 - - aa0 .req v3 - aa1 .req v4 - bb0 .req v5 - bb1 .req v6 - bb1t .req v7 - - res0l .req v8 - res1l .req v9 - res0h .req v10 - res1h .req v11 - - tmp0 .req v12 - tmp1 .req v13 - q_tmp0 .req q12 - q_tmp1 .req q13 - - out0 .req v26 - out1 .req v27 - - t0 .req v28 - -#if MLKEM_K == 2 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean): - push_stack - - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - - mov count, #(MLKEM_N / 16) -k2_loop_start: - - load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr - pmull res, aa, bb - load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr - pmlal res, aa, bb - - montgomery_reduce_long out0, res0 - montgomery_reduce_long out1, res1 - - st2_wrap out, out - - subs count, count, #1 - cbnz count, k2_loop_start - - pop_stack - ret -#endif /* MLKEM_K == 2 */ - -#if MLKEM_K == 3 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - - mov count, #(MLKEM_N / 16) -k3_loop_start: - - load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr - pmull res, aa, bb - load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr - pmlal res, aa, bb - load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr - pmlal res, aa, 
bb - - montgomery_reduce_long out0, res0 - montgomery_reduce_long out1, res1 - - st2_wrap out, out - - subs count, count, #1 - cbnz count, k3_loop_start - - pop_stack - ret -#endif /* MLKEM_K == 3 */ - -#if MLKEM_K == 4 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - add a3_ptr, a0_ptr, #(3 * 512) - add b3_ptr, b0_ptr, #(3 * 512) - add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) - - // Bounds: - // - // Each pmull is bound by 2*4096*2^15=2^28, so the final value - // before Montgomery reduction is bound by 2^30. 
- - mov count, #(MLKEM_N / 16) -k4_loop_start: - - load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr - pmull res, aa, bb - load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr - pmlal res, aa, bb - load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr - pmlal res, aa, bb - load_polys aa, bb, a3_ptr, b3_ptr, b3_cache_ptr - pmlal res, aa, bb - - montgomery_reduce_long out0, res0 - montgomery_reduce_long out1, res1 - - st2_wrap out, out - - subs count, count, #1 - cbnz count, k4_loop_start - - pop_stack - ret -#endif /* MLKEM_K == 4 */ - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq out - .unreq a0_ptr - .unreq b0_ptr - .unreq b0_cache_ptr - .unreq a1_ptr - .unreq b1_ptr - .unreq b1_cache_ptr - .unreq a2_ptr - .unreq b2_ptr - .unreq b2_cache_ptr - .unreq a3_ptr - .unreq b3_ptr - .unreq b3_cache_ptr - .unreq count - .unreq modulus - .unreq modulus_twisted - .unreq aa0 - .unreq aa1 - .unreq bb0 - .unreq bb1 - .unreq bb1t - .unreq res0l - .unreq res1l - .unreq res0h - .unreq wtmp - .unreq res1h - .unreq tmp0 - .unreq tmp1 - .unreq q_tmp0 - .unreq q_tmp1 - .unreq out0 - .unreq out1 - .unreq t0 - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S deleted file mode 100644 index 275ca06d2..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/polyvec_opt.S +++ /dev/null @@ -1,1606 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -// AArch64 re-implementation of the asymmetric base multiplication from: - -// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 -// https://eprint.iacr.org/2021/986 -// https://github.com/neon-ntt/neon-ntt - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) - -// Input: -// - Vectors al, ah of 32-bit entries -// 
Output: -// - Montgomery reductions of al || ah, stored in al -.macro montgomery_reduce_long x, a - uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, modulus_twisted.8h - smlal \a\()l.4s, t0.4h, modulus.4h - smlal2 \a\()h.4s, t0.8h, modulus.8h - uzp2 \x\().8h, \a\()l.8h, \a\()h.8h -.endm - -// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit. - -// Bounds: -// - Assume |a| < 4096, -// - Result: < 2*4096*2^15 = 2^28 -.macro pmull d, a, b - smull \d\()0l.4s, \a\()0.4h, \b\()0.4h - smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smull \d\()1l.4s, \a\()0.4h, \b\()1.4h - smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro pmlal d, a, b - smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h - smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h - smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro ld2_wrap a, ptr - ldr q_tmp0, [\ptr\()], #32 - ldr q_tmp1, [\ptr\(), #-16] - uzp1 \a\()0.8h, tmp0.8h, tmp1.8h - uzp2 \a\()1.8h, tmp0.8h, tmp1.8h -.endm - -.macro st2_wrap a, ptr - zip1 tmp0.8h, \a\()0.8h, \a\()1.8h - zip2 tmp1.8h, \a\()0.8h, \a\()1.8h - str q_tmp0, [\ptr\()], #32 - str q_tmp1, [\ptr\(), #-16] -.endm - -.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2_wrap \a\(), \a_ptr - ld2_wrap \b\(), \b_ptr - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs 
-.endm - -.macro pop_stack - restore_vregs -.endm - - out .req x0 - a0_ptr .req x1 - b0_ptr .req x2 - b0_cache_ptr .req x3 - a1_ptr .req x4 - b1_ptr .req x5 - b1_cache_ptr .req x6 - a2_ptr .req x7 - b2_ptr .req x8 - b2_cache_ptr .req x9 - a3_ptr .req x10 - b3_ptr .req x11 - b3_cache_ptr .req x12 - count .req x13 - wtmp .req w14 - - modulus .req v0 - modulus_twisted .req v2 - - aa0 .req v3 - aa1 .req v4 - bb0 .req v5 - bb1 .req v6 - bb1t .req v7 - - res0l .req v8 - res1l .req v9 - res0h .req v10 - res1h .req v11 - - tmp0 .req v12 - tmp1 .req v13 - q_tmp0 .req q12 - q_tmp1 .req q13 - - out0 .req v26 - out1 .req v27 - - t0 .req v28 - -#if MLKEM_K == 2 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt): - push_stack - - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - - mov count, #(MLKEM_N / 16) - // Instructions: 75 - // Expected cycles: 94 - // Expected IPC: 0.80 - - // Cycle bound: 94.0 - // IPC bound: 0.80 - - // Wall time: 1.49s - // User time: 1.49s - - // --------------------------- original position ----------------------------> - // 0 25 50 - // |------------------------|------------------------| - ldr q9, [x4], #32 // *.......................................................................... - ldr q5, [x4, #-16] // ......*.................................................................... - ldr q11, [x5], #32 // .*......................................................................... - uzp1 v23.8H, v9.8H, v5.8H // .........*................................................................. - uzp2 v9.8H, v9.8H, v5.8H // .....................*..................................................... 
- ldr q5, [x2], #32 // ..*........................................................................ - ldr q7, [x5, #-16] // ..............*............................................................ - ldr q21, [x2, #-16] // ...*....................................................................... - uzp2 v10.8H, v11.8H, v7.8H // .................*......................................................... - uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................ - uzp1 v7.8H, v5.8H, v21.8H // ....*...................................................................... - uzp2 v5.8H, v5.8H, v21.8H // .....*..................................................................... - ldr q21, [x1], #32 // .......*................................................................... - ldr q25, [x1, #-16] // ........*.................................................................. - ld1 {v6.8H}, [x3], #16 // ............................*.............................................. - uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................ - uzp2 v21.8H, v21.8H, v25.8H // ...........*............................................................... - smull v25.4S, v26.4H, v5.4H // ............*.............................................................. - smull2 v5.4S, v26.8H, v5.8H // .............*............................................................. - smull v19.4S, v26.4H, v7.4H // ..........................*................................................ - smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................ - smlal v25.4S, v21.4H, v7.4H // ...............*........................................................... - smlal2 v5.4S, v21.8H, v7.8H // ................*.......................................................... 
- smlal v19.4S, v21.4H, v6.4H // ...................................*....................................... - smlal2 v26.4S, v21.8H, v6.8H // .................................*......................................... - smlal v25.4S, v23.4H, v10.4H // ...................*....................................................... - smlal2 v5.4S, v23.8H, v10.8H // ....................*...................................................... - smlal v19.4S, v23.4H, v11.4H // ......................................*.................................... - smlal2 v26.4S, v23.8H, v11.8H // ....................................*...................................... - ld1 {v23.8H}, [x6], #16 // ........................*.................................................. - smlal v25.4S, v9.4H, v11.4H // ......................*.................................................... - smlal2 v5.4S, v9.8H, v11.8H // .......................*................................................... - smlal2 v26.4S, v9.8H, v23.8H // .......................................*................................... - smlal v19.4S, v9.4H, v23.4H // .........................................*................................. - ldr q9, [x4], #32 // ...............................*........................................... - uzp1 v11.8H, v25.8H, v5.8H // .........................*................................................. - uzp1 v23.8H, v19.8H, v26.8H // .............................................*............................. - mul v11.8H, v11.8H, v2.8H // ...........................*............................................... - mul v23.8H, v23.8H, v2.8H // ..............................................*............................ - ldr q7, [x5], #32 // ................................*.......................................... - smlal2 v5.4S, v11.8H, v0.8H // .............................*............................................. 
- smlal v25.4S, v11.4H, v0.4H // ..................................*........................................ - ldr q11, [x2], #32 // .....................................*..................................... - ldr q21, [x2, #-16] // ........................................*.................................. - ldr q6, [x4, #-16] // ...............................................*........................... - uzp1 v17.8H, v11.8H, v21.8H // ...........................................*............................... - ldr q10, [x1], #32 // ................................................*.......................... - ldr q29, [x1, #-16] // .................................................*......................... - uzp2 v11.8H, v11.8H, v21.8H // ............................................*.............................. - uzp1 v13.8H, v9.8H, v6.8H // ...................................................*....................... - uzp1 v3.8H, v10.8H, v29.8H // ....................................................*...................... - uzp2 v10.8H, v10.8H, v29.8H // .....................................................*..................... - smull v12.4S, v3.4H, v11.4H // ......................................................*.................... - smull2 v11.4S, v3.8H, v11.8H // .......................................................*................... - ldr q21, [x5, #-16] // ........................................................*.................. - smlal v12.4S, v10.4H, v17.4H // .........................................................*................. - smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................ - uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*............... - uzp1 v15.8H, v7.8H, v21.8H // ............................................................*.............. 
- smlal v12.4S, v13.4H, v29.4H // .............................................................*............. - smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............ - uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*........... - smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................ - smlal v12.4S, v28.4H, v15.4H // .................................................................*......... - smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........ - smlal v19.4S, v23.4H, v0.4H // ................................................................*.......... - uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................ - smull v23.4S, v3.4H, v17.4H // ......................................................................*.... - uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*..... - uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*...... - mul v14.8H, v9.8H, v2.8H // .......................................................................*... - ld1 {v22.8H}, [x6], #16 // ...................................................................*....... - zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. - smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................* - ld1 {v4.8H}, [x3], #16 // .........................................................................*. 
- - // ------------------------------ new position ------------------------------> - // 0 25 50 - // |------------------------|------------------------|------------------------ - // ldr q18, [x4], #32 // *.......................................................................... - // ldr q30, [x5], #32 // ..*........................................................................ - // ldr q8, [x2], #32 // .....*..................................................................... - // ldr q9, [x2, #-16] // .......*................................................................... - // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................ - // uzp2 v4.8H, v8.8H, v9.8H // ...........*............................................................... - // ldr q19, [x4, #-16] // .*......................................................................... - // ldr q29, [x1], #32 // ............*.............................................................. - // ldr q12, [x1, #-16] // .............*............................................................. - // uzp1 v13.8H, v18.8H, v19.8H // ...*....................................................................... - // uzp1 v3.8H, v29.8H, v12.8H // ...............*........................................................... - // uzp2 v10.8H, v29.8H, v12.8H // ................*.......................................................... - // smull v12.4S, v3.4H, v4.4H // .................*......................................................... - // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................ - // ldr q5, [x5, #-16] // ......*.................................................................... - // smlal v12.4S, v10.4H, v17.4H // .....................*..................................................... 
- // smlal2 v11.4S, v10.8H, v17.8H // ......................*.................................................... - // uzp2 v14.8H, v30.8H, v5.8H // ........*.................................................................. - // uzp1 v15.8H, v30.8H, v5.8H // .........*................................................................. - // smlal v12.4S, v13.4H, v14.4H // .........................*................................................. - // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................ - // uzp2 v28.8H, v18.8H, v19.8H // ....*...................................................................... - // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................ - // smlal2 v11.4S, v28.8H, v15.8H // ...............................*........................................... - // ld1 {v22.8H}, [x6], #16 // .............................*............................................. - // uzp1 v1.8H, v12.8H, v11.8H // ...................................*....................................... - // smull v23.4S, v3.4H, v17.4H // ...................*....................................................... - // mul v14.8H, v1.8H, v2.8H // .....................................*..................................... - // ld1 {v4.8H}, [x3], #16 // ..............*............................................................ - // smlal2 v11.4S, v14.8H, v0.8H // ........................................*.................................. - // smull2 v20.4S, v3.8H, v17.8H // ....................*...................................................... - // ldr q18, [x4], #32 // ..................................*........................................ - // ldr q30, [x5], #32 // .......................................*................................... - // smlal2 v20.4S, v10.8H, v4.8H // ........................*.................................................. 
- // smlal v12.4S, v14.4H, v0.4H // .........................................*................................. - // smlal v23.4S, v10.4H, v4.4H // .......................*................................................... - // smlal2 v20.4S, v13.8H, v15.8H // ............................*.............................................. - // ldr q8, [x2], #32 // ..........................................*................................ - // smlal v23.4S, v13.4H, v15.4H // ...........................*............................................... - // smlal2 v20.4S, v28.8H, v22.8H // ................................*.......................................... - // ldr q9, [x2, #-16] // ...........................................*............................... - // smlal v23.4S, v28.4H, v22.4H // .................................*......................................... - // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........ - // uzp1 v17.8H, v8.8H, v9.8H // .............................................*............................. - // uzp2 v4.8H, v8.8H, v9.8H // ................................................*.......................... - // uzp1 v5.8H, v23.8H, v20.8H // ....................................*...................................... - // mul v31.8H, v5.8H, v2.8H // ......................................*.................................... - // ldr q19, [x4, #-16] // ............................................*.............................. - // ldr q29, [x1], #32 // ..............................................*............................ - // ldr q12, [x1, #-16] // ...............................................*........................... - // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............ - // uzp1 v13.8H, v18.8H, v19.8H // .................................................*......................... 
- // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................ - // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*....................... - // smull v12.4S, v3.4H, v4.4H // ....................................................*...................... - // smull2 v11.4S, v3.8H, v4.8H // .....................................................*..................... - // ldr q5, [x5, #-16] // ......................................................*.................... - // smlal v12.4S, v10.4H, v17.4H // .......................................................*................... - // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*.................. - // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*................. - // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................ - // smlal v12.4S, v13.4H, v14.4H // ...........................................................*............... - // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*.............. - // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*............. - // smlal v23.4S, v31.4H, v0.4H // .................................................................*......... - // smlal v12.4S, v28.4H, v15.4H // ...............................................................*........... - // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*.......... - // ld1 {v22.8H}, [x6], #16 // .......................................................................*... - // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*..... 
- // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*...... - // smull v23.4S, v3.4H, v17.4H // ...................................................................*....... - // mul v14.8H, v1.8H, v2.8H // ......................................................................*.... - // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. - // ld1 {v4.8H}, [x3], #16 // ..........................................................................* - // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*. - - sub count, count, #2 -1: - // Instructions: 48 - // Expected cycles: 58 - // Expected IPC: 0.83 - - // Cycle bound: 58.0 - // IPC bound: 0.83 - - // Wall time: 6.39s - // User time: 6.39s - - // -------------- original position --------------> - // 0 25 - // |------------------------|---------------------- - smull2 v20.4S, v3.8H, v17.8H // ..........*..................................... - ldr q18, [x4], #32 // .................e.............................. - ldr q30, [x5], #32 // .....................e.......................... - smlal2 v20.4S, v10.8H, v4.8H // ............*................................... - smlal v12.4S, v14.4H, v0.4H // .........................................*...... - smlal v23.4S, v10.4H, v4.4H // ...........*.................................... - str q9, [x0, #16] // ...............................................l - smlal2 v20.4S, v13.8H, v15.8H // ...........................*.................... - ldr q8, [x2], #32 // ....e........................................... - smlal v23.4S, v13.4H, v15.4H // ..........................*..................... - smlal2 v20.4S, v28.8H, v22.8H // .............................*.................. - zip1 v26.8H, v19.8H, v27.8H // ............................................l... 
- ldr q9, [x2, #-16] // .....e.......................................... - smlal v23.4S, v28.4H, v22.4H // ............................*................... - uzp2 v27.8H, v12.8H, v11.8H // ...........................................*.... - uzp1 v17.8H, v8.8H, v9.8H // ......e......................................... - uzp2 v4.8H, v8.8H, v9.8H // .......e........................................ - uzp1 v5.8H, v23.8H, v20.8H // ..................................*............. - str q26, [x0], #32 // ..............................................l. - mul v31.8H, v5.8H, v2.8H // ...................................*............ - ldr q19, [x4, #-16] // ..................e............................. - ldr q29, [x1], #32 // e............................................... - ldr q12, [x1, #-16] // .e.............................................. - smlal2 v20.4S, v31.8H, v0.8H // .....................................*.......... - uzp1 v13.8H, v18.8H, v19.8H // ...................e............................ - uzp1 v3.8H, v29.8H, v12.8H // ..e............................................. - uzp2 v10.8H, v29.8H, v12.8H // ...e............................................ - smull v12.4S, v3.4H, v4.4H // .............e.................................. - smull2 v11.4S, v3.8H, v4.8H // ..............e................................. - ldr q5, [x5, #-16] // ......................e......................... - smlal v12.4S, v10.4H, v17.4H // ...............e................................ - smlal2 v11.4S, v10.8H, v17.8H // ................e............................... - uzp2 v14.8H, v30.8H, v5.8H // ........................e....................... - uzp1 v15.8H, v30.8H, v5.8H // .......................e........................ - smlal v12.4S, v13.4H, v14.4H // ..............................e................. - smlal2 v11.4S, v13.8H, v14.8H // ...............................e................ - uzp2 v28.8H, v18.8H, v19.8H // ....................e........................... 
- smlal v23.4S, v31.4H, v0.4H // ....................................*........... - smlal v12.4S, v28.4H, v15.4H // ................................e............... - smlal2 v11.4S, v28.8H, v15.8H // .................................e.............. - ld1 {v22.8H}, [x6], #16 // .........................e...................... - uzp2 v19.8H, v23.8H, v20.8H // ......................................*......... - uzp1 v1.8H, v12.8H, v11.8H // .......................................e........ - smull v23.4S, v3.4H, v17.4H // .........e...................................... - mul v14.8H, v1.8H, v2.8H // ........................................e....... - zip2 v9.8H, v19.8H, v27.8H // .............................................*.. - ld1 {v4.8H}, [x3], #16 // ........e....................................... - smlal2 v11.4S, v14.8H, v0.8H // ..........................................e..... - - // ------------------------------------------------- new position --------------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------------- - // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'.................. - // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'.................. - // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'.................. - // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'.................. - // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~.......... 
- // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~...... - // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~... - // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~.. - // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'.................. - // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'.................. - // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~.................. - // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~............. - // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~............... - // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'.................. - // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'.................. - // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'.................. - // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'.................. - // ldr q12, [x4], #32 // e..............................................'~..............................................'~................. 
- // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'.................. - // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'.................. - // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'.................. - // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................ - // ldr q13, [x5, #-16] // ............................e..................'............................~..................'.................. - // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'.................. - // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'.................. - // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'.................. - // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~......... - // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~........... - // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~..... - // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........ - // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'.................. 
- // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'.................. - // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'.................. - // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'.................. - // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~. - // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'.................. - // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'.................. - // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'.................. - // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'.................. - // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'.................. - // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'.................. - // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~.............. - // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'.................. - // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~.... 
- // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l....... - // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'.................. - // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l - // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............ - - sub count, count, #1 - cbnz count, 1b - // Instructions: 21 - // Expected cycles: 35 - // Expected IPC: 0.60 - - // Cycle bound: 35.0 - // IPC bound: 0.60 - - // Wall time: 0.08s - // User time: 0.08s - - // ----- original position -----> - // 0 25 - // |------------------------|---- - smull2 v5.4S, v3.8H, v17.8H // *............................. - smlal v12.4S, v14.4H, v0.4H // ..*........................... - smlal v23.4S, v10.4H, v4.4H // ...*.......................... - str q9, [x0, #16] // ....*......................... - smlal2 v5.4S, v10.8H, v4.8H // .*............................ - uzp2 v11.8H, v12.8H, v11.8H // ..........*................... - zip1 v9.8H, v19.8H, v27.8H // ........*..................... - smlal v23.4S, v13.4H, v15.4H // ......*....................... - smlal2 v5.4S, v13.8H, v15.8H // .....*........................ - str q9, [x0], #32 // ............*................. - smlal v23.4S, v28.4H, v22.4H // .........*.................... - smlal2 v5.4S, v28.8H, v22.8H // .......*...................... - uzp1 v9.8H, v23.8H, v5.8H // ...........*.................. - mul v9.8H, v9.8H, v2.8H // .............*................ - smlal2 v5.4S, v9.8H, v0.8H // ..............*............... - smlal v23.4S, v9.4H, v0.4H // ...............*.............. - uzp2 v9.8H, v23.8H, v5.8H // ................*............. - zip2 v5.8H, v9.8H, v11.8H // .................*............ 
- zip1 v9.8H, v9.8H, v11.8H // ...................*.......... - str q5, [x0, #16] // ..................*........... - str q9, [x0], #32 // ....................*......... - - // -------- new position --------> - // 0 25 - // |------------------------|----- - // smull2 v20.4S, v3.8H, v17.8H // *.............................. - // smlal2 v20.4S, v10.8H, v4.8H // ....*.......................... - // smlal v12.4S, v14.4H, v0.4H // .*............................. - // smlal v23.4S, v10.4H, v4.4H // ..*............................ - // str q9, [x0, #16] // ...*........................... - // smlal2 v20.4S, v13.8H, v15.8H // ........*...................... - // smlal v23.4S, v13.4H, v15.4H // .......*....................... - // smlal2 v20.4S, v28.8H, v22.8H // ...........*................... - // zip1 v26.8H, v19.8H, v27.8H // ......*........................ - // smlal v23.4S, v28.4H, v22.4H // ..........*.................... - // uzp2 v27.8H, v12.8H, v11.8H // .....*......................... - // uzp1 v5.8H, v23.8H, v20.8H // ............*.................. - // str q26, [x0], #32 // .........*..................... - // mul v31.8H, v5.8H, v2.8H // .............*................. - // smlal2 v20.4S, v31.8H, v0.8H // ..............*................ - // smlal v23.4S, v31.4H, v0.4H // ...............*............... - // uzp2 v19.8H, v23.8H, v20.8H // ................*.............. - // zip2 v9.8H, v19.8H, v27.8H // .................*............. - // str q9, [x0, #16] // ...................*........... - // zip1 v26.8H, v19.8H, v27.8H // ..................*............ - // str q26, [x0], #32 // ....................*.......... 
- - - pop_stack - ret -#endif /* MLKEM_K == 2 */ - -#if MLKEM_K == 3 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - - mov count, #(MLKEM_N / 16) - // Instructions: 75 - // Expected cycles: 103 - // Expected IPC: 0.73 - - // Cycle bound: 103.0 - // IPC bound: 0.73 - - // Wall time: 0.94s - // User time: 0.94s - - // --------------------------- original position ----------------------------> - // 0 25 50 - // |------------------------|------------------------| - ldr q7, [x2, #16] // *.......................................................................... - ldr q20, [x2], #32 // ..*........................................................................ - ldr q15, [x1, #16] // .*......................................................................... - uzp1 v8.8H, v20.8H, v7.8H // ...............*........................................................... - uzp2 v7.8H, v20.8H, v7.8H // ................*.......................................................... - ld1 {v20.8H}, [x3], #16 // ...*....................................................................... - ldr q30, [x1], #32 // ..............*............................................................ - ldr q11, [x4], #32 // ....*...................................................................... - uzp1 v16.8H, v30.8H, v15.8H // .................*......................................................... - uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................ 
- smull v30.4S, v16.4H, v7.4H // ...................*....................................................... - smull2 v7.4S, v16.8H, v7.8H // ....................*...................................................... - smull v9.4S, v16.4H, v8.4H // .....................*..................................................... - smull2 v16.4S, v16.8H, v8.8H // ......................*.................................................... - smlal v30.4S, v15.4H, v8.4H // .......................*................................................... - smlal2 v7.4S, v15.8H, v8.8H // ........................*.................................................. - smlal v9.4S, v15.4H, v20.4H // .........................*................................................. - smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................ - ldr q20, [x4, #-16] // .....*..................................................................... - ldr q15, [x5], #32 // ......*.................................................................... - uzp1 v8.8H, v11.8H, v20.8H // ...........................*............................................... - uzp2 v20.8H, v11.8H, v20.8H // ............................*.............................................. - ldr q11, [x5, #-16] // .......*................................................................... - ld1 {v27.8H}, [x6], #16 // ........*.................................................................. - uzp1 v10.8H, v15.8H, v11.8H // .............................*............................................. - uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................ - smlal v9.4S, v8.4H, v10.4H // ...............................*........................................... - smlal2 v16.4S, v8.8H, v10.8H // ................................*.......................................... 
- smlal v30.4S, v8.4H, v15.4H // .................................*......................................... - smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................ - smlal v9.4S, v20.4H, v27.4H // ...................................*....................................... - smlal2 v16.4S, v20.8H, v27.8H // ....................................*...................................... - smlal v30.4S, v20.4H, v10.4H // .....................................*..................................... - smlal2 v7.4S, v20.8H, v10.8H // ......................................*.................................... - ldr q20, [x7], #32 // .........*................................................................. - ldr q15, [x7, #-16] // ..........*................................................................ - ldr q8, [x8], #32 // ...........*............................................................... - uzp1 v11.8H, v20.8H, v15.8H // .......................................*................................... - uzp2 v20.8H, v20.8H, v15.8H // ........................................*.................................. - ldr q15, [x8, #-16] // ............*.............................................................. - ld1 {v27.8H}, [x9], #16 // .............*............................................................. - uzp1 v10.8H, v8.8H, v15.8H // .........................................*................................. - uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................ - smlal v9.4S, v11.4H, v10.4H // ...........................................*............................... - smlal2 v16.4S, v11.8H, v10.8H // ............................................*.............................. - smlal v30.4S, v11.4H, v15.4H // .............................................*............................. 
- smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................ - smlal v9.4S, v20.4H, v27.4H // ...............................................*........................... - smlal2 v16.4S, v20.8H, v27.8H // ................................................*.......................... - smlal v30.4S, v20.4H, v10.4H // .................................................*......................... - smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................ - ldr q15, [x2], #32 // ...............................................................*........... - uzp1 v20.8H, v9.8H, v16.8H // ....................................................*...................... - uzp1 v8.8H, v30.8H, v7.8H // .....................................................*..................... - mul v20.8H, v20.8H, v2.8H // ......................................................*.................... - mul v8.8H, v8.8H, v2.8H // .......................................................*................... - ldr q21, [x4], #32 // .................................................................*......... - smlal v9.4S, v20.4H, v0.4H // ........................................................*.................. - smlal2 v16.4S, v20.8H, v0.8H // .........................................................*................. - smlal v30.4S, v8.4H, v0.4H // ..........................................................*................ - smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*............... - ldr q6, [x4, #-16] // ..................................................................*........ - uzp2 v27.8H, v9.8H, v16.8H // ............................................................*.............. - uzp2 v10.8H, v30.8H, v7.8H // .............................................................*............. 
- ldr q16, [x2, #-16] // ...................................................*....................... - ldr q30, [x1, #16] // ..............................................................*............ - ld1 {v9.8H}, [x3], #16 // ................................................................*.......... - ldr q1, [x5], #32 // ...................................................................*....... - ldr q12, [x5, #-16] // ....................................................................*...... - ld1 {v24.8H}, [x6], #16 // .....................................................................*..... - ldr q19, [x7], #32 // ......................................................................*.... - ldr q31, [x7, #-16] // .......................................................................*... - ldr q17, [x8], #32 // ........................................................................*.. - ldr q18, [x8, #-16] // .........................................................................*. - ld1 {v25.8H}, [x9], #16 // ..........................................................................* - - // ------------------------------ new position ------------------------------> - // 0 25 50 - // |------------------------|------------------------|------------------------ - // ldr q16, [x2, #16] // *.......................................................................... - // ldr q30, [x1, #16] // ..*........................................................................ - // ldr q15, [x2], #32 // .*......................................................................... - // ld1 {v9.8H}, [x3], #16 // .....*..................................................................... - // ldr q21, [x4], #32 // .......*................................................................... - // ldr q6, [x4, #-16] // ..................*........................................................ 
- // ldr q1, [x5], #32 // ...................*....................................................... - // ldr q12, [x5, #-16] // ......................*.................................................... - // ld1 {v24.8H}, [x6], #16 // .......................*................................................... - // ldr q19, [x7], #32 // ..................................*........................................ - // ldr q31, [x7, #-16] // ...................................*....................................... - // ldr q17, [x8], #32 // ....................................*...................................... - // ldr q18, [x8, #-16] // .......................................*................................... - // ld1 {v25.8H}, [x9], #16 // ........................................*.................................. - // ldr q20, [x1], #32 // ......*.................................................................... - // uzp1 v7.8H, v15.8H, v16.8H // ...*....................................................................... - // uzp2 v15.8H, v15.8H, v16.8H // ....*...................................................................... - // uzp1 v8.8H, v20.8H, v30.8H // ........*.................................................................. - // uzp2 v20.8H, v20.8H, v30.8H // .........*................................................................. - // smull v30.4S, v8.4H, v15.4H // ..........*................................................................ - // smull2 v15.4S, v8.8H, v15.8H // ...........*............................................................... - // smull v11.4S, v8.4H, v7.4H // ............*.............................................................. - // smull2 v8.4S, v8.8H, v7.8H // .............*............................................................. - // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................ 
- // smlal2 v15.4S, v20.8H, v7.8H // ...............*........................................................... - // smlal v11.4S, v20.4H, v9.4H // ................*.......................................................... - // smlal2 v8.4S, v20.8H, v9.8H // .................*......................................................... - // uzp1 v7.8H, v21.8H, v6.8H // ....................*...................................................... - // uzp2 v20.8H, v21.8H, v6.8H // .....................*..................................................... - // uzp1 v16.8H, v1.8H, v12.8H // ........................*.................................................. - // uzp2 v9.8H, v1.8H, v12.8H // .........................*................................................. - // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................ - // smlal2 v8.4S, v7.8H, v16.8H // ...........................*............................................... - // smlal v30.4S, v7.4H, v9.4H // ............................*.............................................. - // smlal2 v15.4S, v7.8H, v9.8H // .............................*............................................. - // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................ - // smlal2 v8.4S, v20.8H, v24.8H // ...............................*........................................... - // smlal v30.4S, v20.4H, v16.4H // ................................*.......................................... - // smlal2 v15.4S, v20.8H, v16.8H // .................................*......................................... - // uzp1 v7.8H, v19.8H, v31.8H // .....................................*..................................... - // uzp2 v20.8H, v19.8H, v31.8H // ......................................*.................................... 
- // uzp1 v16.8H, v17.8H, v18.8H // .........................................*................................. - // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................ - // smlal v11.4S, v7.4H, v16.4H // ...........................................*............................... - // smlal2 v8.4S, v7.8H, v16.8H // ............................................*.............................. - // smlal v30.4S, v7.4H, v9.4H // .............................................*............................. - // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................ - // smlal v11.4S, v20.4H, v25.4H // ...............................................*........................... - // smlal2 v8.4S, v20.8H, v25.8H // ................................................*.......................... - // smlal v30.4S, v20.4H, v16.4H // .................................................*......................... - // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................ - // ldr q16, [x2, #16] // ................................................................*.......... - // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*...................... - // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*..................... - // mul v7.8H, v7.8H, v2.8H // ......................................................*.................... - // mul v20.8H, v20.8H, v2.8H // .......................................................*................... - // smlal v11.4S, v7.4H, v0.4H // .........................................................*................. - // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................ - // smlal v30.4S, v20.4H, v0.4H // ...........................................................*............... 
- // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*.............. - // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............ - // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*........... - // ldr q30, [x1, #16] // .................................................................*......... - // ldr q15, [x2], #32 // ...................................................*....................... - // ld1 {v9.8H}, [x3], #16 // ..................................................................*........ - // ldr q21, [x4], #32 // ........................................................*.................. - // ldr q6, [x4, #-16] // .............................................................*............. - // ldr q1, [x5], #32 // ...................................................................*....... - // ldr q12, [x5, #-16] // ....................................................................*...... - // ld1 {v24.8H}, [x6], #16 // .....................................................................*..... - // ldr q19, [x7], #32 // ......................................................................*.... - // ldr q31, [x7, #-16] // .......................................................................*... - // ldr q17, [x8], #32 // ........................................................................*.. - // ldr q18, [x8, #-16] // .........................................................................*. 
- // ld1 {v25.8H}, [x9], #16 // ..........................................................................* - - sub count, count, #2 -1: - // Instructions: 65 - // Expected cycles: 80 - // Expected IPC: 0.81 - - // Cycle bound: 80.0 - // IPC bound: 0.81 - - // Wall time: 11.64s - // User time: 11.64s - - // ---------------------- original position -----------------------> - // 0 25 50 - // |------------------------|------------------------|-------------- - ldr q20, [x1], #32 // *................................................................ - uzp1 v7.8H, v15.8H, v16.8H // ......*.......................................................... - uzp2 v15.8H, v15.8H, v16.8H // .......*......................................................... - uzp1 v8.8H, v20.8H, v30.8H // ..*.............................................................. - uzp2 v20.8H, v20.8H, v30.8H // ...*............................................................. - smull v30.4S, v8.4H, v15.4H // .............*................................................... - smull2 v15.4S, v8.8H, v15.8H // ..............*.................................................. - smull v11.4S, v8.4H, v7.4H // .........*....................................................... - smull2 v8.4S, v8.8H, v7.8H // ..........*...................................................... - smlal v30.4S, v20.4H, v7.4H // ...............*................................................. - smlal2 v15.4S, v20.8H, v7.8H // ................*................................................ - smlal v11.4S, v20.4H, v9.4H // ...........*..................................................... - smlal2 v8.4S, v20.8H, v9.8H // ............*.................................................... - uzp1 v7.8H, v21.8H, v6.8H // ...................*............................................. - uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................ 
- uzp1 v16.8H, v1.8H, v12.8H // .......................*......................................... - uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................ - smlal v11.4S, v7.4H, v16.4H // ..........................*...................................... - smlal2 v8.4S, v7.8H, v16.8H // ...........................*..................................... - smlal v30.4S, v7.4H, v9.4H // ..............................*.................................. - smlal2 v15.4S, v7.8H, v9.8H // ...............................*................................. - smlal v11.4S, v20.4H, v24.4H // ............................*.................................... - smlal2 v8.4S, v20.8H, v24.8H // .............................*................................... - smlal v30.4S, v20.4H, v16.4H // ................................*................................ - smlal2 v15.4S, v20.8H, v16.8H // .................................*............................... - uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................ - uzp2 v20.8H, v19.8H, v31.8H // .....................................*........................... - uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................ - uzp2 v9.8H, v17.8H, v18.8H // .........................................*....................... - smlal v11.4S, v7.4H, v16.4H // ...........................................*..................... - smlal2 v8.4S, v7.8H, v16.8H // ............................................*.................... - smlal v30.4S, v7.4H, v9.4H // ...............................................*................. - smlal2 v15.4S, v7.8H, v9.8H // ................................................*................ - smlal v11.4S, v20.4H, v25.4H // .............................................*................... - smlal2 v8.4S, v20.8H, v25.8H // ..............................................*.................. 
- smlal v30.4S, v20.4H, v16.4H // .................................................*............... - smlal2 v15.4S, v20.8H, v16.8H // ..................................................*.............. - ldr q16, [x2, #16] // .....e........................................................... - uzp1 v7.8H, v11.8H, v8.8H // ...................................................*............. - uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........ - mul v7.8H, v7.8H, v2.8H // ....................................................*............ - mul v20.8H, v20.8H, v2.8H // .........................................................*....... - zip2 v9.8H, v27.8H, v10.8H // ..............................................................l.. - zip1 v27.8H, v27.8H, v10.8H // .............................................................l... - smlal v11.4S, v7.4H, v0.4H // .....................................................*........... - smlal2 v8.4S, v7.8H, v0.8H // ......................................................*.......... - smlal v30.4S, v20.4H, v0.4H // ..........................................................*...... - smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*..... - str q27, [x0], #32 // ...............................................................l. - uzp2 v27.8H, v11.8H, v8.8H // .......................................................*......... - str q9, [x0, #-16] // ................................................................l - uzp2 v10.8H, v30.8H, v15.8H // ............................................................*.... - ldr q30, [x1, #16] // .e............................................................... - ldr q15, [x2], #32 // ....e............................................................ - ld1 {v9.8H}, [x3], #16 // ........e........................................................ 
- ldr q21, [x4], #32 // .................e............................................... - ldr q6, [x4, #-16] // ..................e.............................................. - ldr q1, [x5], #32 // .....................e........................................... - ldr q12, [x5, #-16] // ......................e.......................................... - ld1 {v24.8H}, [x6], #16 // .........................e....................................... - ldr q19, [x7], #32 // ..................................e.............................. - ldr q31, [x7, #-16] // ...................................e............................. - ldr q17, [x8], #32 // ......................................e.......................... - ldr q18, [x8, #-16] // .......................................e......................... - ld1 {v25.8H}, [x9], #16 // ..........................................e...................... - - // ---------------------------------------------------------------- new position -----------------------------------------------------------------> - // 0 25 50 75 100 125 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ - // ldr q12, [x1], #32 // ............................*................................................................~.................................................. - // ldr q13, [x1, #-16] // ...............e............'...................................................~............'.................................................. - // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~............................................... - // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~.............................................. 
- // ldr q12, [x2], #32 // ................e...........'....................................................~...........'.................................................. - // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~............. - // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~................................................. - // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................ - // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'.................................................. - // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~........................................... - // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~.......................................... - // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~....................................... - // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~...................................... - // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~............................................. - // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................ 
- // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~......................................... - // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................ - // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'.................................................. - // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'.................................................. - // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~..................................... - // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~.................................... - // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'.................................................. - // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'.................................................. - // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~................................... - // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~.................................. - // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'.................................................. 
- // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~................................. - // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................ - // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~............................. - // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................ - // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~............................... - // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~.............................. - // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~........................... - // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~.......................... - // ldr q12, [x7], #32 // .......................e....'...........................................................~....'.................................................. - // ldr q13, [x7, #-16] // ........................e...'............................................................~...'.................................................. - // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~......................... 
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................ - // ldr q12, [x8], #32 // .........................e..'.............................................................~..'.................................................. - // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'.................................................. - // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~....................... - // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~...................... - // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'.................................................. - // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~..................... - // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~.................... - // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~................. - // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................ - // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~................... 
- // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~.................. - // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~............... - // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~.............. - // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............ - // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~.......... - // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~...... - // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~..... - // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~. - // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~........... - // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~......... - // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~.... 
- // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~... - // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'.................................................. - // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l....... - // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........ - // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l.. - // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l - - sub count, count, #1 - cbnz count, 1b - // Instructions: 55 - // Expected cycles: 61 - // Expected IPC: 0.90 - - // Cycle bound: 61.0 - // IPC bound: 0.90 - - // Wall time: 8.41s - // User time: 8.41s - - // ----------------- original position ------------------> - // 0 25 50 - // |------------------------|------------------------|---- - ldr q7, [x1], #32 // *...................................................... - uzp1 v20.8H, v15.8H, v16.8H // .*..................................................... - uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... - uzp1 v23.8H, v7.8H, v30.8H // ...*................................................... - uzp2 v11.8H, v7.8H, v30.8H // ....*.................................................. - smull2 v8.4S, v23.8H, v20.8H // ........*.............................................. 
- smull v5.4S, v23.4H, v20.4H // .......*............................................... - smull2 v30.4S, v23.8H, v15.8H // ......*................................................ - uzp1 v28.8H, v1.8H, v12.8H // ...............*....................................... - smlal2 v8.4S, v11.8H, v9.8H // ............*.......................................... - smlal v5.4S, v11.4H, v9.4H // ...........*........................................... - uzp1 v3.8H, v21.8H, v6.8H // .............*......................................... - smull v16.4S, v23.4H, v15.4H // .....*................................................. - smlal2 v8.4S, v3.8H, v28.8H // ..................*.................................... - smlal v5.4S, v3.4H, v28.4H // .................*..................................... - uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................ - uzp1 v7.8H, v17.8H, v18.8H // ...........................*........................... - smlal2 v8.4S, v29.8H, v24.8H // ......................*................................ - uzp1 v14.8H, v19.8H, v31.8H // .........................*............................. - smlal v16.4S, v11.4H, v20.4H // .........*............................................. - smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................ - smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................ - uzp2 v20.8H, v1.8H, v12.8H // ................*...................................... - uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................ - smlal2 v30.4S, v3.8H, v20.8H // ....................*.................................. - smlal v16.4S, v3.4H, v20.4H // ...................*................................... - smlal v5.4S, v29.4H, v24.4H // .....................*................................. - uzp2 v9.8H, v17.8H, v18.8H // ............................*.......................... 
- smlal2 v30.4S, v29.8H, v28.8H // ........................*.............................. - smlal v16.4S, v29.4H, v28.4H // .......................*............................... - smlal v5.4S, v14.4H, v7.4H // .............................*......................... - smlal2 v8.4S, v21.8H, v25.8H // ..................................*.................... - smlal2 v30.4S, v14.8H, v9.8H // ................................*...................... - smlal v16.4S, v14.4H, v9.4H // ...............................*....................... - smlal v5.4S, v21.4H, v25.4H // .................................*..................... - zip1 v20.8H, v27.8H, v10.8H // ..........................................*............ - smlal2 v30.4S, v21.8H, v7.8H // ....................................*.................. - smlal v16.4S, v21.4H, v7.4H // ...................................*................... - uzp1 v7.8H, v5.8H, v8.8H // .....................................*................. - str q20, [x0], #32 // ...............................................*....... - mul v15.8H, v7.8H, v2.8H // .......................................*............... - uzp1 v7.8H, v16.8H, v30.8H // ......................................*................ - zip2 v31.8H, v27.8H, v10.8H // .........................................*............. - mul v20.8H, v7.8H, v2.8H // ........................................*.............. - smlal v5.4S, v15.4H, v0.4H // ...........................................*........... - smlal2 v8.4S, v15.8H, v0.8H // ............................................*.......... - str q31, [x0, #-16] // .................................................*..... - smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........ - smlal v16.4S, v20.4H, v0.4H // .............................................*......... - uzp2 v15.8H, v5.8H, v8.8H // ................................................*...... 
- uzp2 v20.8H, v16.8H, v30.8H // ..................................................*.... - zip1 v7.8H, v15.8H, v20.8H // ....................................................*.. - zip2 v20.8H, v15.8H, v20.8H // ...................................................*... - str q7, [x0], #32 // .....................................................*. - str q20, [x0, #-16] // ......................................................* - - // -------------------- new position --------------------> - // 0 25 50 - // |------------------------|------------------------|---- - // ldr q20, [x1], #32 // *...................................................... - // uzp1 v7.8H, v15.8H, v16.8H // .*..................................................... - // uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... - // uzp1 v8.8H, v20.8H, v30.8H // ...*................................................... - // uzp2 v20.8H, v20.8H, v30.8H // ....*.................................................. - // smull v30.4S, v8.4H, v15.4H // ............*.......................................... - // smull2 v15.4S, v8.8H, v15.8H // .......*............................................... - // smull v11.4S, v8.4H, v7.4H // ......*................................................ - // smull2 v8.4S, v8.8H, v7.8H // .....*................................................. - // smlal v30.4S, v20.4H, v7.4H // ...................*................................... - // smlal2 v15.4S, v20.8H, v7.8H // ....................*.................................. - // smlal v11.4S, v20.4H, v9.4H // ..........*............................................ - // smlal2 v8.4S, v20.8H, v9.8H // .........*............................................. - // uzp1 v7.8H, v21.8H, v6.8H // ...........*........................................... - // uzp2 v20.8H, v21.8H, v6.8H // ...............*....................................... 
- // uzp1 v16.8H, v1.8H, v12.8H // ........*.............................................. - // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................ - // smlal v11.4S, v7.4H, v16.4H // ..............*........................................ - // smlal2 v8.4S, v7.8H, v16.8H // .............*......................................... - // smlal v30.4S, v7.4H, v9.4H // .........................*............................. - // smlal2 v15.4S, v7.8H, v9.8H // ........................*.............................. - // smlal v11.4S, v20.4H, v24.4H // ..........................*............................ - // smlal2 v8.4S, v20.8H, v24.8H // .................*..................................... - // smlal v30.4S, v20.4H, v16.4H // .............................*......................... - // smlal2 v15.4S, v20.8H, v16.8H // ............................*.......................... - // uzp1 v7.8H, v19.8H, v31.8H // ..................*.................................... - // uzp2 v20.8H, v19.8H, v31.8H // .......................*............................... - // uzp1 v16.8H, v17.8H, v18.8H // ................*...................................... - // uzp2 v9.8H, v17.8H, v18.8H // ...........................*........................... - // smlal v11.4S, v7.4H, v16.4H // ..............................*........................ - // smlal2 v8.4S, v7.8H, v16.8H // .....................*................................. - // smlal v30.4S, v7.4H, v9.4H // .................................*..................... - // smlal2 v15.4S, v7.8H, v9.8H // ................................*...................... - // smlal v11.4S, v20.4H, v25.4H // ..................................*.................... - // smlal2 v8.4S, v20.8H, v25.8H // ...............................*....................... - // smlal v30.4S, v20.4H, v16.4H // .....................................*................. 
- // smlal2 v15.4S, v20.8H, v16.8H // ....................................*.................. - // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................ - // uzp1 v20.8H, v30.8H, v15.8H // .........................................*............. - // mul v7.8H, v7.8H, v2.8H // ........................................*.............. - // mul v20.8H, v20.8H, v2.8H // ...........................................*........... - // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............ - // zip1 v27.8H, v27.8H, v10.8H // ...................................*................... - // smlal v11.4S, v7.4H, v0.4H // ............................................*.......... - // smlal2 v8.4S, v7.8H, v0.8H // .............................................*......... - // smlal v30.4S, v20.4H, v0.4H // ................................................*...... - // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*....... - // str q27, [x0], #32 // .......................................*............... - // uzp2 v27.8H, v11.8H, v8.8H // .................................................*..... - // str q9, [x0, #-16] // ..............................................*........ - // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*.... - // zip2 v9.8H, v27.8H, v10.8H // ....................................................*.. - // zip1 v27.8H, v27.8H, v10.8H // ...................................................*... - // str q27, [x0], #32 // .....................................................*. 
- // str q9, [x0, #-16] // ......................................................* - - - pop_stack - ret -#endif /* MLKEM_K == 3 */ - -#if MLKEM_K == 4 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - add a3_ptr, a0_ptr, #(3 * 512) - add b3_ptr, b0_ptr, #(3 * 512) - add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) - - // Bounds: - - // Each pmull is bound by 2*4096*2^15=2^28, so the final value - // before Montgomery reduction is bound by 2^30. - - mov count, #(MLKEM_N / 16) - // Instructions: 114 - // Expected cycles: 153 - // Expected IPC: 0.75 - // - // Cycle bound: 153.0 - // IPC bound: 0.75 - // - // Wall time: 0.69s - // User time: 0.69s - // - // ----------------------------------------------- original position -----------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------------- - ldr q23, [x2, #16] // .*................................................................................................................ - ldr q19, [x2], #32 // *................................................................................................................. - ldr q17, [x5], #32 // ..*............................................................................................................... - uzp2 v13.8H, v19.8H, v23.8H // ..........*....................................................................................................... 
- uzp1 v19.8H, v19.8H, v23.8H // ...........*...................................................................................................... - ldr q23, [x5, #-16] // ...*.............................................................................................................. - ldr q30, [x1, #16] // .....*............................................................................................................ - uzp2 v9.8H, v17.8H, v23.8H // ....*............................................................................................................. - uzp1 v23.8H, v17.8H, v23.8H // .......*.......................................................................................................... - ldr q17, [x1], #32 // ......*........................................................................................................... - ldr q10, [x7, #16] // .............*.................................................................................................... - uzp1 v12.8H, v17.8H, v30.8H // ........*......................................................................................................... - uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................ - smull2 v30.4S, v12.8H, v13.8H // ............*..................................................................................................... - smull v13.4S, v12.4H, v13.4H // ............................................*..................................................................... - smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................ - smull v12.4S, v12.4H, v19.4H // ..........................................*....................................................................... 
- smlal2 v30.4S, v17.8H, v19.8H // ...............................*.................................................................................. - smlal v13.4S, v17.4H, v19.4H // ...............................................*.................................................................. - ldr q19, [x4], #32 // ....................*............................................................................................. - ldr q16, [x4, #-16] // .....................*............................................................................................ - ld1 {v8.8H}, [x3], #16 // ................................*................................................................................. - uzp1 v26.8H, v19.8H, v16.8H // .......................*.......................................................................................... - uzp2 v19.8H, v19.8H, v16.8H // ........................*......................................................................................... - smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................ - smlal v13.4S, v26.4H, v9.4H // ..................................................*............................................................... - smlal2 v22.4S, v17.8H, v8.8H // ........................................*......................................................................... - smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................ - smlal2 v30.4S, v19.8H, v23.8H // ...................................*.............................................................................. - smlal v13.4S, v19.4H, v23.4H // .......................................................*.......................................................... 
- smlal2 v22.4S, v26.8H, v23.8H // ...........................................*...................................................................... - smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................ - ldr q23, [x7], #32 // ......................*........................................................................................... - ldr q17, [x8, #16] // ..............*................................................................................................... - uzp1 v9.8H, v23.8H, v10.8H // ..........................*....................................................................................... - uzp2 v23.8H, v23.8H, v10.8H // ....................................*............................................................................. - ldr q10, [x10], #32 // ...............*.................................................................................................. - ldr q16, [x10, #-16] // ................*................................................................................................. - ld1 {v8.8H}, [x12], #16 // .................*................................................................................................ - uzp1 v26.8H, v10.8H, v16.8H // ..................*............................................................................................... - uzp2 v10.8H, v10.8H, v16.8H // ...................*.............................................................................................. - ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................ - ldr q3, [x11, #16] // ...........................*...................................................................................... 
- smlal2 v22.4S, v19.8H, v16.8H // ..............................................*................................................................... - smlal v12.4S, v19.4H, v16.4H // ........................................................*......................................................... - ldr q19, [x11], #32 // ............................*..................................................................................... - ld1 {v16.8H}, [x9], #16 // .............................*.................................................................................... - uzp1 v4.8H, v19.8H, v3.8H // ..................................*............................................................................... - uzp2 v19.8H, v19.8H, v3.8H // .......................................*.......................................................................... - ldr q3, [x8], #32 // ..............................*................................................................................... - ldr q31, [x2], #32 // ......................................*........................................................................... - uzp1 v6.8H, v3.8H, v17.8H // ...................................................*.............................................................. - uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................ - smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*....................................................... - smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*...................................................... - smlal v13.4S, v9.4H, v17.4H // ............................................................*..................................................... 
- smlal v12.4S, v9.4H, v6.4H // .............................................................*.................................................... - smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*................................................... - smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*.................................................. - smlal v13.4S, v23.4H, v6.4H // ................................................................*................................................. - smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................ - smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*............................................... - smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*.............................................. - smlal v13.4S, v26.4H, v19.4H // ....................................................................*............................................. - smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................ - smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*........................................... - smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*.......................................... - smlal v13.4S, v10.4H, v4.4H // ........................................................................*......................................... - smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................ 
- ldr q19, [x2, #-16] // .........................................*........................................................................ - uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*...................................... - uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*............................. - mul v23.8H, v23.8H, v2.8H // .............................................................................*.................................... - uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*................................. - uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*.............................. - mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................ - smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................ - smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*............................... - ldr q23, [x5], #32 // .............................................*.................................................................... - smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*...... - uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*........................... - smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*..... 
- ldr q17, [x5, #-16] // ................................................*................................................................. - ldr q13, [x1, #16] // ......................................................*........................................................... - uzp2 v27.8H, v23.8H, v17.8H // ....................................................*............................................................. - uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*..................................... - uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*.. - ldr q23, [x1], #32 // ..........................................................................*....................................... - zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................* - ldr q3, [x7, #16] // ........................................................................................*......................... - uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*................................... - uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*.................................. - smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*.......................... - ldr q6, [x8, #16] // .........................................................................................*........................ - ldr q23, [x10], #32 // ..........................................................................................*....................... 
- smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*....... - ldr q17, [x10, #-16] // ...........................................................................................*...................... - ld1 {v22.8H}, [x12], #16 // ............................................................................................*..................... - uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*.................... - uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*................... - ldr q23, [x4], #32 // ...............................................................................................*.................. - ldr q17, [x4, #-16] // ................................................................................................*................. - ldr q4, [x7], #32 // .................................................................................................*................ - uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*............... - uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*.............. - uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............ - smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*... - ld1 {v8.8H}, [x6], #16 // ....................................................................................................*............. 
- ldr q25, [x11, #16] // ......................................................................................................*........... - ldr q29, [x11], #32 // .......................................................................................................*.......... - ld1 {v12.8H}, [x9], #16 // ........................................................................................................*......... - uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*. - ldr q14, [x8], #32 // .........................................................................................................*........ - ld1 {v23.8H}, [x3], #16 // .............................................................................................................*.... - - // ------------------------------------------------- new position --------------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------------- - // ldr q3, [x2], #32 // .*................................................................................................................ - // ldr q17, [x2, #-16] // *................................................................................................................. - // ldr q21, [x5], #32 // ..*............................................................................................................... - // ldr q19, [x5, #-16] // .....*............................................................................................................ - // uzp2 v27.8H, v21.8H, v19.8H // .......*.......................................................................................................... - // ldr q25, [x1, #16] // ......*........................................................................................................... 
- // ldr q22, [x1], #32 // .........*........................................................................................................ - // uzp1 v28.8H, v21.8H, v19.8H // ........*......................................................................................................... - // uzp1 v31.8H, v22.8H, v25.8H // ...........*...................................................................................................... - // uzp2 v16.8H, v22.8H, v25.8H // ............*..................................................................................................... - // uzp2 v21.8H, v3.8H, v17.8H // ...*.............................................................................................................. - // uzp1 v19.8H, v3.8H, v17.8H // ....*............................................................................................................. - // smull2 v24.4S, v31.8H, v21.8H // .............*.................................................................................................... - // ldr q3, [x7, #16] // ..........*....................................................................................................... - // ldr q6, [x8, #16] // .................................*................................................................................ - // ldr q8, [x10], #32 // ....................................*............................................................................. - // ldr q26, [x10, #-16] // .....................................*............................................................................ - // ld1 {v22.8H}, [x12], #16 // ......................................*........................................................................... - // uzp1 v30.8H, v8.8H, v26.8H // .......................................*.......................................................................... 
- // uzp2 v11.8H, v8.8H, v26.8H // ........................................*......................................................................... - // ldr q8, [x4], #32 // ...................*.............................................................................................. - // ldr q26, [x4, #-16] // ....................*............................................................................................. - // ldr q4, [x7], #32 // ................................*................................................................................. - // uzp1 v20.8H, v8.8H, v26.8H // ......................*........................................................................................... - // uzp2 v26.8H, v8.8H, v26.8H // .......................*.......................................................................................... - // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................ - // uzp1 v9.8H, v4.8H, v3.8H // ..................................*............................................................................... - // ldr q25, [x11, #16] // ..........................................*....................................................................... - // ldr q29, [x11], #32 // .............................................*.................................................................... - // ld1 {v12.8H}, [x9], #16 // ..............................................*................................................................... - // ldr q14, [x8], #32 // .................................................*................................................................ - // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................ 
- // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................ - // smlal2 v24.4S, v20.8H, v27.8H // ........................*......................................................................................... - // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*.................................................................. - // smlal2 v24.4S, v26.8H, v28.8H // ............................*..................................................................................... - // uzp2 v4.8H, v4.8H, v3.8H // ...................................*.............................................................................. - // smull2 v13.4S, v31.8H, v19.8H // ...............*.................................................................................................. - // ldr q3, [x2], #32 // ..................................................*............................................................... - // uzp2 v1.8H, v29.8H, v25.8H // ................................................*................................................................. - // smlal2 v13.4S, v16.8H, v23.8H // ..........................*....................................................................................... - // ldr q17, [x2, #-16] // .....................................................................*............................................ - // smull v18.4S, v31.4H, v19.4H // ................*................................................................................................. - // smlal2 v13.4S, v20.8H, v28.8H // ..............................*................................................................................... - // smull v29.4S, v31.4H, v21.4H // ..............*................................................................................................... 
- // ldr q21, [x5], #32 // ..............................................................................*................................... - // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*...................................................................... - // smlal v29.4S, v16.4H, v19.4H // ..................*............................................................................................... - // ldr q19, [x5, #-16] // ..................................................................................*............................... - // smlal v18.4S, v16.4H, v23.4H // ...........................*...................................................................................... - // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................ - // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*.............................................................. - // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*............................. - // smlal v18.4S, v20.4H, v28.4H // ...............................*.................................................................................. - // ldr q25, [x1, #16] // ...................................................................................*.............................. - // smlal v29.4S, v26.4H, v28.4H // .............................*.................................................................................... - // smlal v18.4S, v26.4H, v8.4H // ............................................*..................................................................... - // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*............................................................. 
- // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................ - // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*........................................................... - // smlal v29.4S, v9.4H, v26.4H // .......................................................*.......................................................... - // smlal v18.4S, v9.4H, v31.4H // ........................................................*......................................................... - // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................ - // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*....................................................... - // smlal v29.4S, v4.4H, v31.4H // ...........................................................*...................................................... - // smlal v18.4S, v4.4H, v12.4H // ............................................................*..................................................... - // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................................................... - // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*................................................... - // smlal v29.4S, v30.4H, v1.4H // ...............................................................*.................................................. - // smlal v18.4S, v30.4H, v10.4H // ................................................................*................................................. - // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................ 
- // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*............................................... - // smlal v29.4S, v11.4H, v10.4H // ...................................................................*.............................................. - // smlal v18.4S, v11.4H, v22.4H // ....................................................................*............................................. - // ldr q22, [x1], #32 // .......................................................................................*.......................... - // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*........................................... - // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................ - // mul v19.8H, v31.8H, v2.8H // ........................................................................*......................................... - // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*....................... - // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*...................... - // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................ - // smlal v29.4S, v19.4H, v0.4H // ............................................................................*..................................... - // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*.................................... - // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*....................................... 
- // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*.......................................... - // mul v23.8H, v26.8H, v2.8H // ...........................................................................*...................................... - // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*................................. - // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*..................... - // ldr q3, [x7, #16] // .........................................................................................*........................ - // ldr q6, [x8, #16] // .............................................................................................*.................... - // ldr q8, [x10], #32 // ..............................................................................................*................... - // ldr q26, [x10, #-16] // ................................................................................................*................. - // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................ - // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*............... - // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*.............. - // ldr q8, [x4], #32 // ....................................................................................................*............. - // ldr q26, [x4, #-16] // .....................................................................................................*............ 
- // ldr q4, [x7], #32 // ......................................................................................................*........... - // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*.......... - // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*......... - // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*...... - // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........ - // ldr q25, [x11, #16] // ............................................................................................................*..... - // ldr q29, [x11], #32 // .............................................................................................................*.... - // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*... - // ldr q14, [x8], #32 // ................................................................................................................*. - // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*.................. - // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*.................................. - // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................ 
- // ld1 {v23.8H}, [x3], #16 // .................................................................................................................* - // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*....... - // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*........................... - // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*.. - // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*......................... - - sub count, count, #2 -1: - // Instructions: 82 - // Expected cycles: 102 - // Expected IPC: 0.80 - // - // Cycle bound: 102.0 - // IPC bound: 0.80 - // - // Wall time: 15.93s - // User time: 15.93s - // - // ------------------------------- original position -------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|------ - smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................ - uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................ - smull2 v13.4S, v31.8H, v19.8H // ..........*....................................................................... - ldr q3, [x2], #32 // ....e............................................................................. - uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*....................... - smlal2 v13.4S, v16.8H, v23.8H // ............*..................................................................... - ldr q17, [x2, #-16] // .....e............................................................................ 
- smull v18.4S, v31.4H, v19.4H // .........*........................................................................ - smlal2 v13.4S, v20.8H, v28.8H // ...........................*...................................................... - smull v29.4S, v31.4H, v21.4H // .............*.................................................................... - ldr q21, [x5], #32 // .....................e............................................................ - smlal2 v13.4S, v26.8H, v8.8H // .............................*.................................................... - smlal v29.4S, v16.4H, v19.4H // ...............*.................................................................. - ldr q19, [x5, #-16] // ......................e........................................................... - smlal v18.4S, v16.4H, v23.4H // ...........*...................................................................... - smlal v29.4S, v20.4H, v27.4H // ..............................*................................................... - uzp1 v31.8H, v14.8H, v6.8H // ........................................*......................................... - uzp2 v27.8H, v21.8H, v19.8H // ........................e......................................................... - smlal v18.4S, v20.4H, v28.4H // ..........................*....................................................... - ldr q25, [x1, #16] // .e................................................................................ - smlal v29.4S, v26.4H, v28.4H // ................................*................................................. - smlal v18.4S, v26.4H, v8.4H // ............................*..................................................... - uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................ - smlal2 v13.4S, v9.8H, v31.8H // ............................................*..................................... 
- smlal2 v24.4S, v9.8H, v26.8H // ................................................*................................. - smlal v29.4S, v9.4H, v26.4H // ...............................................*.................................. - smlal v18.4S, v9.4H, v31.4H // ...........................................*...................................... - smlal2 v13.4S, v4.8H, v12.8H // ..............................................*................................... - smlal2 v24.4S, v4.8H, v31.8H // ..................................................*............................... - smlal v29.4S, v4.4H, v31.4H // .................................................*................................ - smlal v18.4S, v4.4H, v12.4H // .............................................*.................................... - smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................... - smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................ - smlal v29.4S, v30.4H, v1.4H // ................................................................*................. - smlal v18.4S, v30.4H, v10.4H // ............................................................*..................... - smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*.................. - smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*.............. - smlal v29.4S, v11.4H, v10.4H // ..................................................................*............... - smlal v18.4S, v11.4H, v22.4H // ..............................................................*................... - ldr q22, [x1], #32 // e................................................................................. - uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........ 
- uzp1 v28.8H, v21.8H, v19.8H // .......................e.......................................................... - mul v19.8H, v31.8H, v2.8H // ..........................................................................*....... - uzp1 v31.8H, v22.8H, v25.8H // ..e............................................................................... - uzp2 v16.8H, v22.8H, v25.8H // ...e.............................................................................. - uzp2 v21.8H, v3.8H, v17.8H // .......e.......................................................................... - smlal v29.4S, v19.4H, v0.4H // ...........................................................................*...... - smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*..... - uzp1 v19.8H, v3.8H, v17.8H // ......e........................................................................... - uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*............. - zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l.. - mul v23.8H, v26.8H, v2.8H // .....................................................................*............ - uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*.... - smull2 v24.4S, v31.8H, v21.8H // ..............e................................................................... - str q14, [x0, #16] // .................................................................................l - ldr q3, [x7, #16] // ...................................e.............................................. - ldr q6, [x8, #16] // .......................................e.......................................... - ldr q8, [x10], #32 // ...................................................e.............................. 
- ldr q26, [x10, #-16] // ....................................................e............................. - ld1 {v22.8H}, [x12], #16 // ...........................................................e...................... - uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................ - uzp2 v11.8H, v8.8H, v26.8H // ......................................................e........................... - ldr q8, [x4], #32 // .................e................................................................ - ldr q26, [x4, #-16] // ..................e............................................................... - ldr q4, [x7], #32 // ..................................e............................................... - uzp1 v20.8H, v8.8H, v26.8H // ...................e.............................................................. - uzp2 v26.8H, v8.8H, v26.8H // ....................e............................................................. - ld1 {v8.8H}, [x6], #16 // .........................e........................................................ - uzp1 v9.8H, v4.8H, v3.8H // ....................................e............................................. - ldr q25, [x11, #16] // ........................................................e......................... - ldr q29, [x11], #32 // .......................................................e.......................... - ld1 {v12.8H}, [x9], #16 // ..........................................e....................................... - ldr q14, [x8], #32 // ......................................e........................................... - smlal2 v24.4S, v16.8H, v19.8H // ................e................................................................. - smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*.......... 
- smlal v18.4S, v23.4H, v0.4H // ......................................................................*........... - ld1 {v23.8H}, [x3], #16 // ........e......................................................................... - smlal2 v24.4S, v20.8H, v27.8H // ...............................e.................................................. - uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*......... - uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................ - str q5, [x0], #32 // ................................................................................l. - zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*... - - // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 175 200 225 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- - // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~......................................... - // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~............................................................. 
- // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~..................................... - // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~.................................... - // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~............................................................................. - // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~.......................................................................... - // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................ - // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~................................... - // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~.... 
- // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~......................................................................... - // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~.............................................................................. - // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~.................................................................. - // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~........................................................................... - // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~....................................................................... - // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~........................... - // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~.................................................................... 
- // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~....... - // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~.................. - // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~................. - // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~............... - // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~.............. - // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~...................................................................... - // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~................................................................... 
- // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~....................................... - // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~............................................................... - // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~............. - // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~.............................................................. - // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................ - // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~........................................................... - // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~..................................................................... 
- // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~................................................................. - // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~... - // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................ - // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................ - // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................ - // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~......................... - // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............ 
- // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~............................................................................... - // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........ - // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................ - // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................ - // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~.......................................................... - // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~......... - // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~...................................................... 
- // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~......................................................... - // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~.................................................. - // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~..................................................... - // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~....................................................... - // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................ - // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~................................................... - // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~.................................................... 
- // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~....................... - // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~...................... - // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~.................... - // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~................... - // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~.......... - // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~........... - // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~. 
- // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................ - // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~..................... - // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~.............................................. - // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~................................................. - // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~.......................................... - // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~............................................. - // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~............................................... 
- // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................ - // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~........................................... - // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................ - // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~............................... - // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~............................. - // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~..... - // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~...... 
- // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~.. - // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................ - // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~...................................... - // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~.................................. - // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~................................. - // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................ - // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................ 
- // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l.............................. - // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l - // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l.......................... - - sub count, count, #1 - cbnz count, 1b - // Instructions: 50 - // Expected cycles: 56 - // Expected IPC: 0.89 - // - // Cycle bound: 56.0 - // IPC bound: 0.89 - // - // Wall time: 4.16s - // User time: 4.16s - // - // --------------- original position ---------------> - // 0 25 - // |------------------------| - smull2 v17.4S, v31.8H, v19.8H // ..*............................................... - uzp2 v1.8H, v14.8H, v6.8H // ................*................................. - smull v18.4S, v31.4H, v21.4H // .......*.......................................... - smlal2 v24.4S, v26.8H, v28.8H // *................................................. - smlal2 v17.4S, v16.8H, v23.8H // ....*............................................. - smull v21.4S, v31.4H, v19.4H // .....*............................................ - smlal v18.4S, v16.4H, v19.4H // .........*........................................ - uzp2 v31.8H, v4.8H, v3.8H // .*................................................ - uzp1 v3.8H, v14.8H, v6.8H // ............*..................................... - smlal v21.4S, v16.4H, v23.4H // ..........*....................................... 
- smlal v18.4S, v20.4H, v27.4H // ...........*...................................... - uzp2 v14.8H, v29.8H, v25.8H // ...*.............................................. - smlal2 v17.4S, v20.8H, v28.8H // ......*........................................... - smlal v21.4S, v20.4H, v28.4H // .............*.................................... - smlal v18.4S, v26.4H, v28.4H // ..............*................................... - smlal2 v24.4S, v9.8H, v1.8H // ..................*............................... - smlal2 v17.4S, v26.8H, v8.8H // ........*......................................... - smlal v21.4S, v26.4H, v8.4H // ...............*.................................. - smlal v18.4S, v9.4H, v1.4H // ...................*.............................. - smlal2 v24.4S, v31.8H, v3.8H // ......................*........................... - smlal2 v17.4S, v9.8H, v3.8H // .................*................................ - smlal v21.4S, v9.4H, v3.4H // ....................*............................. - smlal v18.4S, v31.4H, v3.4H // .......................*.......................... - smlal2 v24.4S, v30.8H, v14.8H // ..........................*....................... - smlal2 v17.4S, v31.8H, v12.8H // .....................*............................ - smlal v21.4S, v31.4H, v12.4H // ........................*......................... - smlal v18.4S, v30.4H, v14.4H // ...........................*...................... - smlal2 v24.4S, v11.8H, v10.8H // ..............................*................... - smlal2 v17.4S, v30.8H, v10.8H // .........................*........................ - smlal v21.4S, v30.4H, v10.4H // ............................*..................... - smlal v18.4S, v11.4H, v10.4H // ...............................*.................. - zip2 v19.8H, v7.8H, v15.8H // ......................................*........... - smlal2 v17.4S, v11.8H, v22.8H // .............................*.................... 
- smlal v21.4S, v11.4H, v22.4H // ................................*................. - uzp1 v23.8H, v18.8H, v24.8H // .................................*................ - str q19, [x0, #16] // .........................................*........ - mul v19.8H, v23.8H, v2.8H // ..................................*............... - uzp1 v23.8H, v21.8H, v17.8H // .....................................*............ - str q5, [x0], #32 // .............................................*.... - mul v26.8H, v23.8H, v2.8H // .......................................*.......... - smlal v18.4S, v19.4H, v0.4H // ...................................*.............. - smlal2 v24.4S, v19.8H, v0.8H // ....................................*............. - smlal v21.4S, v26.4H, v0.4H // ...........................................*...... - smlal2 v17.4S, v26.8H, v0.8H // ..........................................*....... - uzp2 v13.8H, v18.8H, v24.8H // ........................................*......... - uzp2 v19.8H, v21.8H, v17.8H // ............................................*..... - zip1 v23.8H, v19.8H, v13.8H // ..............................................*... - zip2 v19.8H, v19.8H, v13.8H // ...............................................*.. - str q23, [x0], #32 // .................................................* - str q19, [x0, #-16] // ................................................*. - - // ----------------- new position ------------------> - // 0 25 - // |------------------------|------------------------ - // smlal2 v24.4S, v26.8H, v28.8H // ...*.............................................. - // uzp2 v4.8H, v4.8H, v3.8H // .......*.......................................... - // smull2 v13.4S, v31.8H, v19.8H // *................................................. - // uzp2 v1.8H, v29.8H, v25.8H // ...........*...................................... - // smlal2 v13.4S, v16.8H, v23.8H // ....*............................................. 
- // smull v18.4S, v31.4H, v19.4H // .....*............................................ - // smlal2 v13.4S, v20.8H, v28.8H // ............*..................................... - // smull v29.4S, v31.4H, v21.4H // ..*............................................... - // smlal2 v13.4S, v26.8H, v8.8H // ................*................................. - // smlal v29.4S, v16.4H, v19.4H // ......*........................................... - // smlal v18.4S, v16.4H, v23.4H // .........*........................................ - // smlal v29.4S, v20.4H, v27.4H // ..........*....................................... - // uzp1 v31.8H, v14.8H, v6.8H // ........*......................................... - // smlal v18.4S, v20.4H, v28.4H // .............*.................................... - // smlal v29.4S, v26.4H, v28.4H // ..............*................................... - // smlal v18.4S, v26.4H, v8.4H // .................*................................ - // uzp2 v26.8H, v14.8H, v6.8H // .*................................................ - // smlal2 v13.4S, v9.8H, v31.8H // ....................*............................. - // smlal2 v24.4S, v9.8H, v26.8H // ...............*.................................. - // smlal v29.4S, v9.4H, v26.4H // ..................*............................... - // smlal v18.4S, v9.4H, v31.4H // .....................*............................ - // smlal2 v13.4S, v4.8H, v12.8H // ........................*......................... - // smlal2 v24.4S, v4.8H, v31.8H // ...................*.............................. - // smlal v29.4S, v4.4H, v31.4H // ......................*........................... - // smlal v18.4S, v4.4H, v12.4H // .........................*........................ - // smlal2 v13.4S, v30.8H, v10.8H // ............................*..................... - // smlal2 v24.4S, v30.8H, v1.8H // .......................*.......................... 
- // smlal v29.4S, v30.4H, v1.4H // ..........................*....................... - // smlal v18.4S, v30.4H, v10.4H // .............................*.................... - // smlal2 v13.4S, v11.8H, v22.8H // ................................*................. - // smlal2 v24.4S, v11.8H, v10.8H // ...........................*...................... - // smlal v29.4S, v11.4H, v10.4H // ..............................*................... - // smlal v18.4S, v11.4H, v22.4H // .................................*................ - // uzp1 v31.8H, v29.8H, v24.8H // ..................................*............... - // mul v19.8H, v31.8H, v2.8H // ....................................*............. - // smlal v29.4S, v19.4H, v0.4H // ........................................*......... - // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........ - // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............ - // zip2 v14.8H, v7.8H, v15.8H // ...............................*.................. - // mul v23.8H, v26.8H, v2.8H // .......................................*.......... - // uzp2 v15.8H, v29.8H, v24.8H // ............................................*..... - // str q14, [x0, #16] // ...................................*.............. - // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*...... - // smlal v18.4S, v23.4H, v0.4H // ..........................................*....... - // uzp2 v7.8H, v18.8H, v13.8H // .............................................*.... - // str q5, [x0], #32 // ......................................*........... - // zip1 v5.8H, v7.8H, v15.8H // ..............................................*... - // zip2 v14.8H, v7.8H, v15.8H // ...............................................*.. - // str q14, [x0, #16] // .................................................* - // str q5, [x0], #32 // ................................................*. 
- - - pop_stack - ret -#endif /* MLKEM_K == 4 */ - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq out - .unreq a0_ptr - .unreq b0_ptr - .unreq b0_cache_ptr - .unreq a1_ptr - .unreq b1_ptr - .unreq b1_cache_ptr - .unreq a2_ptr - .unreq b2_ptr - .unreq b2_cache_ptr - .unreq a3_ptr - .unreq b3_ptr - .unreq b3_cache_ptr - .unreq count - .unreq modulus - .unreq modulus_twisted - .unreq wtmp - .unreq aa0 - .unreq aa1 - .unreq bb0 - .unreq bb1 - .unreq bb1t - .unreq res0l - .unreq res1l - .unreq res0h - .unreq res1h - .unreq tmp0 - .unreq tmp1 - .unreq q_tmp0 - .unreq q_tmp1 - .unreq out0 - .unreq out1 - .unreq t0 - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S index 8302d2a3e..f2451815a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_asm_clean.S @@ -19,8 +19,8 @@ * Returns number of sampled 16-bit integers (at most MLKEM_N). **************************************************/ #include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ // We save the output on the stack first, and copy to the actual // output buffer only in the end. 
This is because the main loop can overwrite @@ -112,9 +112,9 @@ mlkem_q .req v30 bits .req v31 -.text -.global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean) -.balign 4 + .text + .global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean) + .balign 4 MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean): push_stack @@ -402,5 +402,5 @@ return: .unreq mlkem_q .unreq bits -#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */ +/* simpasm: footer-start */ +#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c index becdf303b..592c15fb0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/aarch64/src/rej_uniform_table.c @@ -10,8 +10,7 @@ #include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) #include #include "arith_native_aarch64.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h index f9fe4310a..df43dc5b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_aarch64/native/default.h @@ -11,21 +11,10 @@ #include "../sys.h" #ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. - */ #include "aarch64/opt.h" #endif /* SYS_AARCH64 */ #ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. 
- * https://github.com/pq-crystals/kyber - */ #include "x86_64/default.h" #endif /* SYS_X86_64 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h index f9fe4310a..df43dc5b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_ref/native/default.h @@ -11,21 +11,10 @@ #include "../sys.h" #ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. - */ #include "aarch64/opt.h" #endif /* SYS_AARCH64 */ #ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. - * https://github.com/pq-crystals/kyber - */ #include "x86_64/default.h" #endif /* SYS_X86_64 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h index f9fe4310a..df43dc5b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/default.h @@ -11,21 +11,10 @@ #include "../sys.h" #ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. - */ #include "aarch64/opt.h" #endif /* SYS_AARCH64 */ #ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. 
- * https://github.com/pq-crystals/kyber - */ #include "x86_64/default.h" #endif /* SYS_X86_64 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S index 5fdc3d0a0..3063d20ae 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/basemul.S @@ -8,6 +8,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ #include "consts.h" @@ -113,6 +114,7 @@ vmovdqa %ymm11,(64*\off+48)*2(%rdi) .text .global MLKEM_ASM_NAMESPACE(basemul_avx2) +.balign 4 MLKEM_ASM_NAMESPACE(basemul_avx2): mov %rsp,%r8 and $-32,%rsp @@ -133,4 +135,5 @@ schoolbook 3 mov %r8,%rsp ret +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S index 7b1f22624..e74199930 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/intt.S @@ -12,6 +12,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ #include "consts.h" #include "shuffle.inc" @@ -242,6 +243,7 @@ vmovdqa %ymm11,(64*\off+176)*2(%rdi) .text .global MLKEM_ASM_NAMESPACE(invntt_avx2) +.balign 4 MLKEM_ASM_NAMESPACE(invntt_avx2): vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 @@ -252,4 +254,5 @@ intt_level6 0 intt_level6 1 ret +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S index 5d928b4cc..70582fbc1 100644 --- 
a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntt.S @@ -8,6 +8,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ #include "consts.h" #include "shuffle.inc" @@ -205,6 +206,7 @@ vmovdqa %ymm11,(128*\off+112)*2(%rdi) .text .global MLKEM_ASM_NAMESPACE(ntt_avx2) +.balign 4 MLKEM_ASM_NAMESPACE(ntt_avx2): vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 @@ -216,4 +218,5 @@ levels1t6 1 ret +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttfrombytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttfrombytes.S new file mode 100644 index 000000000..71f2af000 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttfrombytes.S @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(nttfrombytes_avx2): +#consts +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret + +nttfrombytes128_avx: +#load +vmovdqu (%rsi),%ymm4 +vmovdqu 32(%rsi),%ymm5 +vmovdqu 64(%rsi),%ymm6 +vmovdqu 96(%rsi),%ymm7 +vmovdqu 128(%rsi),%ymm8 +vmovdqu 160(%rsi),%ymm9 + +shuffle8 4,7,3,7 +shuffle8 5,8,4,8 +shuffle8 6,9,5,9 + +shuffle4 3,8,6,8 +shuffle4 7,5,3,5 +shuffle4 4,9,7,9 + +shuffle2 6,5,4,5 +shuffle2 8,7,6,7 +shuffle2 3,9,8,9 + +shuffle1 4,7,10,7 
+shuffle1 5,8,4,8 +shuffle1 6,9,5,9 + +#bitunpack +vpsrlw $12,%ymm10,%ymm11 +vpsllw $4,%ymm7,%ymm12 +vpor %ymm11,%ymm12,%ymm11 +vpand %ymm0,%ymm10,%ymm10 +vpand %ymm0,%ymm11,%ymm11 + +vpsrlw $8,%ymm7,%ymm12 +vpsllw $8,%ymm4,%ymm13 +vpor %ymm12,%ymm13,%ymm12 +vpand %ymm0,%ymm12,%ymm12 + +vpsrlw $4,%ymm4,%ymm13 +vpand %ymm0,%ymm13,%ymm13 + +vpsrlw $12,%ymm8,%ymm14 +vpsllw $4,%ymm5,%ymm15 +vpor %ymm14,%ymm15,%ymm14 +vpand %ymm0,%ymm8,%ymm8 +vpand %ymm0,%ymm14,%ymm14 + +vpsrlw $8,%ymm5,%ymm15 +vpsllw $8,%ymm9,%ymm1 +vpor %ymm15,%ymm1,%ymm15 +vpand %ymm0,%ymm15,%ymm15 + +vpsrlw $4,%ymm9,%ymm1 +vpand %ymm0,%ymm1,%ymm1 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm11,32(%rdi) +vmovdqa %ymm12,64(%rdi) +vmovdqa %ymm13,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm14,160(%rdi) +vmovdqa %ymm15,192(%rdi) +vmovdqa %ymm1,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttpack.S new file mode 100644 index 000000000..4c10ef366 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttpack.S @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(nttpack_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(nttpack_avx2): +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 
8,9,6,9 +shuffle1 10,11,8,11 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +#store +vmovdqa %ymm7,(%rdi) +vmovdqa %ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntttobytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntttobytes.S new file mode 100644 index 000000000..4f0b01e83 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/ntttobytes.S @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(ntttobytes_avx2): +#consts +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret + +ntttobytes128_avx: +#load +vmovdqa (%rsi),%ymm5 +vmovdqa 32(%rsi),%ymm6 +vmovdqa 64(%rsi),%ymm7 +vmovdqa 96(%rsi),%ymm8 +vmovdqa 128(%rsi),%ymm9 +vmovdqa 160(%rsi),%ymm10 +vmovdqa 192(%rsi),%ymm11 +vmovdqa 224(%rsi),%ymm12 + +#bitpack +vpsllw $12,%ymm6,%ymm4 +vpor %ymm4,%ymm5,%ymm4 + +vpsrlw $4,%ymm6,%ymm5 +vpsllw $8,%ymm7,%ymm6 +vpor %ymm5,%ymm6,%ymm5 + +vpsrlw $8,%ymm7,%ymm6 +vpsllw $4,%ymm8,%ymm7 +vpor %ymm6,%ymm7,%ymm6 + +vpsllw $12,%ymm10,%ymm7 
+vpor %ymm7,%ymm9,%ymm7 + +vpsrlw $4,%ymm10,%ymm8 +vpsllw $8,%ymm11,%ymm9 +vpor %ymm8,%ymm9,%ymm8 + +vpsrlw $8,%ymm11,%ymm9 +vpsllw $4,%ymm12,%ymm10 +vpor %ymm9,%ymm10,%ymm9 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 + +shuffle2 3,4,8,4 +shuffle2 6,5,3,5 +shuffle2 7,9,6,9 + +shuffle4 8,3,7,3 +shuffle4 6,4,8,4 +shuffle4 5,9,6,9 + +shuffle8 7,8,5,8 +shuffle8 6,3,7,3 +shuffle8 4,9,6,9 + +#store +vmovdqu %ymm5,(%rdi) +vmovdqu %ymm7,32(%rdi) +vmovdqu %ymm6,64(%rdi) +vmovdqu %ymm8,96(%rdi) +vmovdqu %ymm3,128(%rdi) +vmovdqu %ymm9,160(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttunpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttunpack.S new file mode 100644 index 000000000..0cf45c671 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/nttunpack.S @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(nttunpack_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(nttunpack_avx2): +call nttunpack128_avx2 +add $256,%rdi +call nttunpack128_avx2 +ret + +nttunpack128_avx2: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 
+ +shuffle1 9,5,10,5 +shuffle1 8,4,9,4 +shuffle1 7,3,8,3 +shuffle1 6,11,7,11 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm9,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm3,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/reduce.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/reduce.S new file mode 100644 index 000000000..78bad0559 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/reduce.S @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation based on Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +// Changes: +// - Add call to csub in reduce128_avx to produce outputs +// in [0,1,...,q-1] rather than [0,1,...,q], matching the +// semantics of poly_reduce(). 
+ +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" + +.text +.global MLKEM_ASM_NAMESPACE(reduce_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(reduce_avx2): +#consts +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1 +call reduce128_avx2 +add $256,%rdi +call reduce128_avx2 +ret + +reduce128_avx2: +#load +vmovdqa (%rdi),%ymm2 +vmovdqa 32(%rdi),%ymm3 +vmovdqa 64(%rdi),%ymm4 +vmovdqa 96(%rdi),%ymm5 +vmovdqa 128(%rdi),%ymm6 +vmovdqa 160(%rdi),%ymm7 +vmovdqa 192(%rdi),%ymm8 +vmovdqa 224(%rdi),%ymm9 + +red16 2 +red16 3 +red16 4 +red16 5 +red16 6 +red16 7 +red16 8 +red16 9 + +csubq 2 +csubq 3 +csubq 4 +csubq 5 +csubq 6 +csubq 7 +csubq 8 +csubq 9 + +#store +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm4,64(%rdi) +vmovdqa %ymm5,96(%rdi) +vmovdqa %ymm6,128(%rdi) +vmovdqa %ymm7,160(%rdi) +vmovdqa %ymm8,192(%rdi) +vmovdqa %ymm9,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S deleted file mode 100644 index 9bcd04896..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/shuffle.S +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -// Implementation from Kyber reference repository -// https://github.com/pq-crystals/kyber/blob/main/avx2 - -#include "../../../common.h" - -#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - -#include "consts.h" -#include "fq.inc" -#include "shuffle.inc" - -.global MLKEM_ASM_NAMESPACE(nttpack_avx2) -MLKEM_ASM_NAMESPACE(nttpack_avx2): -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 
128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 -shuffle1 10,11,8,11 - -shuffle2 3,4,10,4 -shuffle2 6,8,3,8 -shuffle2 5,7,6,7 -shuffle2 9,11,5,11 - -shuffle4 10,3,9,3 -shuffle4 6,5,10,5 -shuffle4 4,8,6,8 -shuffle4 7,11,4,11 - -shuffle8 9,10,7,10 -shuffle8 6,4,9,4 -shuffle8 3,5,6,5 -shuffle8 8,11,3,11 - -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -nttunpack128_avx2: -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -.global MLKEM_ASM_NAMESPACE(nttunpack_avx2) -MLKEM_ASM_NAMESPACE(nttunpack_avx2): -call nttunpack128_avx2 -add $256,%rdi -call nttunpack128_avx2 -ret - -ntttobytes128_avx: -#load -vmovdqa (%rsi),%ymm5 -vmovdqa 32(%rsi),%ymm6 -vmovdqa 64(%rsi),%ymm7 -vmovdqa 96(%rsi),%ymm8 -vmovdqa 128(%rsi),%ymm9 -vmovdqa 160(%rsi),%ymm10 -vmovdqa 192(%rsi),%ymm11 -vmovdqa 224(%rsi),%ymm12 - -#bitpack -vpsllw $12,%ymm6,%ymm4 -vpor %ymm4,%ymm5,%ymm4 - -vpsrlw $4,%ymm6,%ymm5 -vpsllw $8,%ymm7,%ymm6 -vpor %ymm5,%ymm6,%ymm5 - -vpsrlw $8,%ymm7,%ymm6 -vpsllw $4,%ymm8,%ymm7 -vpor %ymm6,%ymm7,%ymm6 - -vpsllw $12,%ymm10,%ymm7 -vpor %ymm7,%ymm9,%ymm7 - 
-vpsrlw $4,%ymm10,%ymm8 -vpsllw $8,%ymm11,%ymm9 -vpor %ymm8,%ymm9,%ymm8 - -vpsrlw $8,%ymm11,%ymm9 -vpsllw $4,%ymm12,%ymm10 -vpor %ymm9,%ymm10,%ymm9 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 - -shuffle2 3,4,8,4 -shuffle2 6,5,3,5 -shuffle2 7,9,6,9 - -shuffle4 8,3,7,3 -shuffle4 6,4,8,4 -shuffle4 5,9,6,9 - -shuffle8 7,8,5,8 -shuffle8 6,3,7,3 -shuffle8 4,9,6,9 - -#store -vmovdqu %ymm5,(%rdi) -vmovdqu %ymm7,32(%rdi) -vmovdqu %ymm6,64(%rdi) -vmovdqu %ymm8,96(%rdi) -vmovdqu %ymm3,128(%rdi) -vmovdqu %ymm9,160(%rdi) - -ret - -.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2) -MLKEM_ASM_NAMESPACE(ntttobytes_avx2): -#consts -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0 -call ntttobytes128_avx -add $256,%rsi -add $192,%rdi -call ntttobytes128_avx -ret - -nttfrombytes128_avx: -#load -vmovdqu (%rsi),%ymm4 -vmovdqu 32(%rsi),%ymm5 -vmovdqu 64(%rsi),%ymm6 -vmovdqu 96(%rsi),%ymm7 -vmovdqu 128(%rsi),%ymm8 -vmovdqu 160(%rsi),%ymm9 - -shuffle8 4,7,3,7 -shuffle8 5,8,4,8 -shuffle8 6,9,5,9 - -shuffle4 3,8,6,8 -shuffle4 7,5,3,5 -shuffle4 4,9,7,9 - -shuffle2 6,5,4,5 -shuffle2 8,7,6,7 -shuffle2 3,9,8,9 - -shuffle1 4,7,10,7 -shuffle1 5,8,4,8 -shuffle1 6,9,5,9 - -#bitunpack -vpsrlw $12,%ymm10,%ymm11 -vpsllw $4,%ymm7,%ymm12 -vpor %ymm11,%ymm12,%ymm11 -vpand %ymm0,%ymm10,%ymm10 -vpand %ymm0,%ymm11,%ymm11 - -vpsrlw $8,%ymm7,%ymm12 -vpsllw $8,%ymm4,%ymm13 -vpor %ymm12,%ymm13,%ymm12 -vpand %ymm0,%ymm12,%ymm12 - -vpsrlw $4,%ymm4,%ymm13 -vpand %ymm0,%ymm13,%ymm13 - -vpsrlw $12,%ymm8,%ymm14 -vpsllw $4,%ymm5,%ymm15 -vpor %ymm14,%ymm15,%ymm14 -vpand %ymm0,%ymm8,%ymm8 -vpand %ymm0,%ymm14,%ymm14 - -vpsrlw $8,%ymm5,%ymm15 -vpsllw $8,%ymm9,%ymm1 -vpor %ymm15,%ymm1,%ymm15 -vpand %ymm0,%ymm15,%ymm15 - -vpsrlw $4,%ymm9,%ymm1 -vpand %ymm0,%ymm1,%ymm1 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm11,32(%rdi) -vmovdqa %ymm12,64(%rdi) -vmovdqa %ymm13,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm14,160(%rdi) -vmovdqa %ymm15,192(%rdi) -vmovdqa %ymm1,224(%rdi) - -ret - -.global 
MLKEM_ASM_NAMESPACE(nttfrombytes_avx2) -MLKEM_ASM_NAMESPACE(nttfrombytes_avx2): -#consts -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0 -call nttfrombytes128_avx -add $256,%rdi -add $192,%rsi -call nttfrombytes128_avx -ret - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/tomont.S similarity index 64% rename from src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S rename to src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/tomont.S index 3f013a5fa..7774cec0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/fq.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-512_x86_64/native/x86_64/src/tomont.S @@ -14,63 +14,24 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) -#include "consts.h" +/* simpasm: header-end */ +#include "consts.h" #include "fq.inc" .text -reduce128_avx2: -#load -vmovdqa (%rdi),%ymm2 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm4 -vmovdqa 96(%rdi),%ymm5 -vmovdqa 128(%rdi),%ymm6 -vmovdqa 160(%rdi),%ymm7 -vmovdqa 192(%rdi),%ymm8 -vmovdqa 224(%rdi),%ymm9 - -red16 2 -red16 3 -red16 4 -red16 5 -red16 6 -red16 7 -red16 8 -red16 9 - -csubq 2 -csubq 3 -csubq 4 -csubq 5 -csubq 6 -csubq 7 -csubq 8 -csubq 9 - -#store -vmovdqa %ymm2,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm4,64(%rdi) -vmovdqa %ymm5,96(%rdi) -vmovdqa %ymm6,128(%rdi) -vmovdqa %ymm7,160(%rdi) -vmovdqa %ymm8,192(%rdi) -vmovdqa %ymm9,224(%rdi) - -ret - -.global MLKEM_ASM_NAMESPACE(reduce_avx2) -MLKEM_ASM_NAMESPACE(reduce_avx2): +.global MLKEM_ASM_NAMESPACE(tomont_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(tomont_avx2): #consts vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1 -call reduce128_avx2 +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa 
AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx2 add $256,%rdi -call reduce128_avx2 +call tomont128_avx2 ret - tomont128_avx2: #load vmovdqa (%rdi),%ymm3 @@ -103,15 +64,5 @@ vmovdqa %ymm10,224(%rdi) ret -.global MLKEM_ASM_NAMESPACE(tomont_avx2) -MLKEM_ASM_NAMESPACE(tomont_avx2): -#consts -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1 -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2 -call tomont128_avx2 -add $256,%rdi -call tomont128_avx2 -ret - +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md index e499a4a22..a420f05b6 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/README.md @@ -10,10 +10,9 @@ works: - _Fast and Clean: Auditable high-performance assembly via constraint solving_, Amin Abdulrahman, Hanno Becker, Matthias J. Kannwischer, Fabien Klein, [https://eprint.iacr.org/2022/1303](https://eprint.iacr.org/2022/1303) -## Profiles -This backend comes with two profiles: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to -read and modify; for example, is heavily leverages register aliases and assembly macros. The optimized profile is -automatically generated from the clean profile via [SLOTHY](https://github.com/slothy-optimizer/slothy). Currently, the +## Variants + +This backend comes in two versions: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to read and modify; for example, is heavily leverages register aliases and assembly macros. This directory contains the optimized version, which is automatically generated from the clean one via [SLOTHY](https://github.com/slothy-optimizer/slothy). 
Currently, the target architecture is Cortex-A55, but you can easily re-optimize the code for a different microarchitecture supported -by SLOTHY, by adjusting the parameters in [optimize.sh](src/optimize.sh). +by SLOTHY, by adjusting the parameters in [optimize.sh](../../../test/aarch64_clean/src/optimize.sh). diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h deleted file mode 100644 index f124702a4..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/clean.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -/* ML-KEM arithmetic native profile for clean assembly */ - -#ifdef MLKEM_NATIVE_ARITH_PROFILE_H -#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles? -#else -#define MLKEM_NATIVE_ARITH_PROFILE_H - -/* Identifier for this backend so that source and assembly files - * in the build can be appropriately guarded. */ -#define MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN - -#define MLKEM_NATIVE_ARITH_BACKEND_NAME AARCH64_CLEAN - -/* Filename of the C backend implementation. - * This is not inlined here because this header is included in assembly - * files as well. 
*/ -#define MLKEM_NATIVE_ARITH_BACKEND_IMPL "native/aarch64/src/clean_impl.h" - -#endif /* MLKEM_NATIVE_ARITH_PROFILE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h index a7217163f..4a0243279 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/opt.h @@ -3,8 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -/* ML-KEM arithmetic native profile for clean assembly */ - #ifdef MLKEM_NATIVE_ARITH_PROFILE_H #error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles? #else diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c index 2c1bb31e1..23e7949d3 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/aarch64_zetas.c @@ -10,8 +10,7 @@ #include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) #include #include "arith_native_aarch64.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h index ed0825892..60779598d 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/arith_native_aarch64.h @@ -29,62 +29,49 @@ extern const int16_t aarch64_zetas_mulcache_native[]; extern const int16_t aarch64_zetas_mulcache_twisted_native[]; extern const uint8_t rej_uniform_table[]; -#define ntt_asm_clean 
MLKEM_NAMESPACE(ntt_asm_clean) -void ntt_asm_clean(int16_t *, const int16_t *, const int16_t *); - #define ntt_asm_opt MLKEM_NAMESPACE(ntt_asm_opt) void ntt_asm_opt(int16_t *, const int16_t *, const int16_t *); -#define intt_asm_clean MLKEM_NAMESPACE(intt_asm_clean) -void intt_asm_clean(int16_t *, const int16_t *, const int16_t *); - #define intt_asm_opt MLKEM_NAMESPACE(intt_asm_opt) void intt_asm_opt(int16_t *, const int16_t *, const int16_t *); -#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean) -unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen, - const uint8_t *table); - -#define poly_reduce_asm_clean MLKEM_NAMESPACE(poly_reduce_asm_clean) -void poly_reduce_asm_clean(int16_t *); - #define poly_reduce_asm_opt MLKEM_NAMESPACE(poly_reduce_asm_opt) void poly_reduce_asm_opt(int16_t *); -#define poly_tomont_asm_clean MLKEM_NAMESPACE(poly_tomont_asm_clean) -void poly_tomont_asm_clean(int16_t *); - #define poly_tomont_asm_opt MLKEM_NAMESPACE(poly_tomont_asm_opt) void poly_tomont_asm_opt(int16_t *); -#define poly_mulcache_compute_asm_clean \ - MLKEM_NAMESPACE(poly_mulcache_compute_asm_clean) -void poly_mulcache_compute_asm_clean(int16_t *, const int16_t *, - const int16_t *, const int16_t *); - - #define poly_mulcache_compute_asm_opt \ MLKEM_NAMESPACE(poly_mulcache_compute_asm_opt) void poly_mulcache_compute_asm_opt(int16_t *, const int16_t *, const int16_t *, const int16_t *); -#define poly_tobytes_asm_clean MLKEM_NAMESPACE(poly_tobytes_asm_clean) -void poly_tobytes_asm_clean(uint8_t *r, const int16_t *a); - #define poly_tobytes_asm_opt MLKEM_NAMESPACE(poly_tobytes_asm_opt) void poly_tobytes_asm_opt(uint8_t *r, const int16_t *a); -#define polyvec_basemul_acc_montgomery_cached_asm_clean \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) -void polyvec_basemul_acc_montgomery_cached_asm_clean(int16_t *r, - const int16_t *a, - const int16_t *b, - const int16_t *b_cache); +#define 
polyvec_basemul_acc_montgomery_cached_asm_k2_opt \ + MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt) +void polyvec_basemul_acc_montgomery_cached_asm_k2_opt(int16_t *r, + const int16_t *a, + const int16_t *b, + const int16_t *b_cache); + +#define polyvec_basemul_acc_montgomery_cached_asm_k3_opt \ + MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt) +void polyvec_basemul_acc_montgomery_cached_asm_k3_opt(int16_t *r, + const int16_t *a, + const int16_t *b, + const int16_t *b_cache); + +#define polyvec_basemul_acc_montgomery_cached_asm_k4_opt \ + MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt) +void polyvec_basemul_acc_montgomery_cached_asm_k4_opt(int16_t *r, + const int16_t *a, + const int16_t *b, + const int16_t *b_cache); -#define polyvec_basemul_acc_montgomery_cached_asm_opt \ - MLKEM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) -void polyvec_basemul_acc_montgomery_cached_asm_opt(int16_t *r, const int16_t *a, - const int16_t *b, - const int16_t *b_cache); +#define rej_uniform_asm_clean MLKEM_NAMESPACE(rej_uniform_asm_clean) +unsigned rej_uniform_asm_clean(int16_t *r, const uint8_t *buf, unsigned buflen, + const uint8_t *table); #endif /* MLKEM_AARCH64_NATIVE_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h deleted file mode 100644 index 4be90fb24..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/clean_impl.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -/* ML-KEM arithmetic native profile for clean assembly */ - -#ifdef MLKEM_NATIVE_ARITH_PROFILE_IMPL_H -#error Only one MLKEM_ARITH assembly profile can be defined -- did you include multiple profiles? 
-#else -#define MLKEM_NATIVE_ARITH_PROFILE_IMPL_H - -#include "arith_native_aarch64.h" - -/* Set of primitives that this backend replaces */ -#define MLKEM_USE_NATIVE_NTT -#define MLKEM_USE_NATIVE_INTT -#define MLKEM_USE_NATIVE_POLY_REDUCE -#define MLKEM_USE_NATIVE_POLY_TOMONT -#define MLKEM_USE_NATIVE_POLY_MULCACHE_COMPUTE -#define MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED -#define MLKEM_USE_NATIVE_POLY_TOBYTES -#define MLKEM_USE_NATIVE_REJ_UNIFORM - -static INLINE void ntt_native(int16_t data[MLKEM_N]) -{ - ntt_asm_clean(data, aarch64_ntt_zetas_layer01234, aarch64_ntt_zetas_layer56); -} - -static INLINE void intt_native(int16_t data[MLKEM_N]) -{ - intt_asm_clean(data, aarch64_invntt_zetas_layer01234, - aarch64_invntt_zetas_layer56); -} - -static INLINE void poly_reduce_native(int16_t data[MLKEM_N]) -{ - poly_reduce_asm_clean(data); -} - -static INLINE void poly_tomont_native(int16_t data[MLKEM_N]) -{ - poly_tomont_asm_clean(data); -} - -static INLINE void poly_mulcache_compute_native(int16_t x[MLKEM_N / 2], - const int16_t y[MLKEM_N]) -{ - poly_mulcache_compute_asm_clean(x, y, aarch64_zetas_mulcache_native, - aarch64_zetas_mulcache_twisted_native); -} - -static INLINE void polyvec_basemul_acc_montgomery_cached_native( - int16_t r[MLKEM_N], const int16_t a[MLKEM_K * MLKEM_N], - const int16_t b[MLKEM_K * MLKEM_N], - const int16_t b_cache[MLKEM_K * (MLKEM_N / 2)]) -{ - polyvec_basemul_acc_montgomery_cached_asm_clean(r, a, b, b_cache); -} - -static INLINE void poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES], - const int16_t a[MLKEM_N]) -{ - poly_tobytes_asm_clean(r, a); -} - -static INLINE int rej_uniform_native(int16_t *r, unsigned len, - const uint8_t *buf, unsigned buflen) -{ - if (len != MLKEM_N || buflen % 24 != 0) - { - return -1; - } - return (int)rej_uniform_asm_clean(r, buf, buflen, rej_uniform_table); -} - -#endif /* MLKEM_NATIVE_ARITH_PROFILE_IMPL_H */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S 
b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S deleted file mode 100644 index b0ae1ad46..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_clean.S +++ /dev/null @@ -1,389 +0,0 @@ -/// Copyright (c) 2024 The mlkem-native project authors -/// Copyright (c) 2022 Arm Limited -/// Copyright (c) 2022 Hanno Becker -/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer -/// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE. -/// - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - -// Bounds: -// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) -// -// See mlken/reduce.c and test/test_bounds.py for more details. -.macro mulmodq dst, src, const, idx0, idx1 - // Signed barrett multiplication using - // round-to-nearest-even-integer approximation. 
- // Following https://eprint.iacr.org/2021/986.pdf, this - // is functionally the same as a signed Montgomery multiplication - // with a suitable constant of absolute value < q. - sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()] - mul \dst\().8h, \src\().8h, \const\().h[\idx0\()] - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro mulmod dst, src, const, const_twisted - sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h - mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro gs_butterfly a, b, root, idx0, idx1 - sub tmp.8h, \a\().8h, \b\().8h - add \a\().8h, \a\().8h, \b\().8h - mulmodq \b, tmp, \root, \idx0, \idx1 -.endm - -.macro gs_butterfly_v a, b, root, root_twisted - sub tmp.8h, \a\().8h, \b\().8h - add \a\().8h, \a\().8h, \b\().8h - mulmod \b, tmp, \root, \root_twisted -.endm - -.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3 - mulmod \dst0, \src0, ninv, ninv_tw - mulmod \dst1, \src1, ninv, ninv_tw - mulmod \dst2, \src2, ninv, ninv_tw - mulmod \dst3, \src3, ninv, ninv_tw -.endm - -.macro barrett_reduce a - sqdmulh t0.8h, \a\().8h, consts.h[1] - srshr t0.8h, t0.8h, #11 - mls \a\().8h, t0.8h, consts.h[0] -.endm - -.macro load_roots_012 - ldr q_root0, [r01234_ptr], #32 - ldr q_root1, [r01234_ptr, #-16] -.endm - -.macro load_next_roots_34 - ldr q_root0, [r01234_ptr], #16 -.endm - -.macro load_next_roots_56 - ldr q_root0, [r56_ptr], #(6*16) - ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)] - ldr q_root1, [r56_ptr, #(-6*16 + 2*16)] - ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)] - ldr q_root2, [r56_ptr, #(-6*16 + 4*16)] - ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)] -.endm - -.macro transpose4 data - trn1 t0.4s, \data\()0.4s, \data\()1.4s - trn2 t1.4s, \data\()0.4s, \data\()1.4s - trn1 t2.4s, \data\()2.4s, \data\()3.4s - trn2 t3.4s, \data\()2.4s, \data\()3.4s - - trn2 \data\()2.2d, t0.2d, t2.2d - trn2 \data\()3.2d, t1.2d, t3.2d - trn1 \data\()0.2d, t0.2d, t2.2d - trn1 \data\()1.2d, t1.2d, t3.2d -.endm - -.macro 
transpose_single data_out, data_in - trn1 \data_out\()0.4s, \data_in\()0.4s, \data_in\()1.4s - trn2 \data_out\()1.4s, \data_in\()0.4s, \data_in\()1.4s - trn1 \data_out\()2.4s, \data_in\()2.4s, \data_in\()3.4s - trn2 \data_out\()3.4s, \data_in\()2.4s, \data_in\()3.4s -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs -.endm - -.macro pop_stack - restore_vregs -.endm - -// For comparability reasons, the output range for the coefficients of this -// invNTT code is supposed to match the implementation from PQClean on commit -// ee71d2c823982bfcf54686f3cf1d666f396dc9aa. After the invNTT, the coefficients -// are NOT canonically reduced. The ordering of the coefficients is canonical, -// also matching PQClean. 
- -.text - .global MLKEM_ASM_NAMESPACE(intt_asm_clean) - - in .req x0 - r01234_ptr .req x1 - r56_ptr .req x2 - - inp .req x3 - count .req x4 - wtmp .req w5 - - data0 .req v8 - data1 .req v9 - data2 .req v10 - data3 .req v11 - data4 .req v12 - data5 .req v13 - data6 .req v14 - data7 .req v15 - - q_data0 .req q8 - q_data1 .req q9 - q_data2 .req q10 - q_data3 .req q11 - q_data4 .req q12 - q_data5 .req q13 - q_data6 .req q14 - q_data7 .req q15 - - root0 .req v0 - root1 .req v1 - root2 .req v2 - root0_tw .req v4 - root1_tw .req v5 - root2_tw .req v6 - - consts .req v7 - q_consts .req q7 - - q_root0 .req q0 - q_root1 .req q1 - q_root2 .req q2 - q_root0_tw .req q4 - q_root1_tw .req q5 - q_root2_tw .req q6 - - tmp .req v24 - t0 .req v25 - t1 .req v26 - t2 .req v27 - t3 .req v28 - - ninv .req v29 - ninv_tw .req v30 - -.balign 4 -MLKEM_ASM_NAMESPACE(intt_asm_clean): - push_stack - - // Setup constants - mov wtmp, #3329 - mov consts.h[0], wtmp - mov wtmp, #20159 - mov consts.h[1], wtmp - mov wtmp, #512 - dup ninv.8h, wtmp - mov wtmp, #5040 - dup ninv_tw.8h, wtmp - - mov inp, in - mov count, #8 - -scale_start: - - ldr q_data0, [inp, #(16*0)] - ldr q_data1, [inp, #(16*1)] - ldr q_data2, [inp, #(16*2)] - ldr q_data3, [inp, #(16*3)] - - mul_ninv data0, data1, data2, data3, data0, data1, data2, data3 - // Bounds: Absolute value < q - - str q_data0, [inp], #64 - str q_data1, [inp, #(-64 + 16*1)] - str q_data2, [inp, #(-64 + 16*2)] - str q_data3, [inp, #(-64 + 16*3)] - - subs count, count, #1 - cbnz count, scale_start - - mov inp, in - mov count, #8 - - .p2align 2 -layer3456_start: - - ldr q_data0, [inp, #(16*0)] - ldr q_data1, [inp, #(16*1)] - ldr q_data2, [inp, #(16*2)] - ldr q_data3, [inp, #(16*3)] - - transpose4 data // manual ld4 - - load_next_roots_56 - - // Layer 7 - gs_butterfly_v data0, data1, root1, root1_tw - gs_butterfly_v data2, data3, root2, root2_tw - // Bounds: - // data0, data2: < 2q - // data1, data3: < q - - // Layer 6 - gs_butterfly_v data0, data2, root0, 
root0_tw - gs_butterfly_v data1, data3, root0, root0_tw - // Bounds: - // data0: < 4q - // data1: < 2q - // data2, data3: < q - - transpose4 data - - load_next_roots_34 - - // Layer 5 - gs_butterfly data0, data1, root0, 2, 3 - gs_butterfly data2, data3, root0, 4, 5 - // Max bound: 8q - - // Not all of those reductions are needed, but the bounds tracking - // is easier if we uniformly reduce at this point. - barrett_reduce data0 - barrett_reduce data2 - barrett_reduce data1 - barrett_reduce data3 - - // Bounds: q/2 - - // Layer 4 - gs_butterfly data0, data2, root0, 0, 1 - gs_butterfly data1, data3, root0, 0, 1 - // Bounds: < q - - str q_data0, [inp], #(64) - str q_data1, [inp, #(-64 + 16*1)] - str q_data2, [inp, #(-64 + 16*2)] - str q_data3, [inp, #(-64 + 16*3)] - - subs count, count, #1 - cbnz count, layer3456_start - - // --------------------------------------------------------------------- - - mov count, #4 - load_roots_012 - - .p2align 2 - -layer012_start: - - ldr q_data0, [in, #0] - ldr q_data1, [in, #(1*(512/8))] - ldr q_data2, [in, #(2*(512/8))] - ldr q_data3, [in, #(3*(512/8))] - ldr q_data4, [in, #(4*(512/8))] - ldr q_data5, [in, #(5*(512/8))] - ldr q_data6, [in, #(6*(512/8))] - ldr q_data7, [in, #(7*(512/8))] - - gs_butterfly data0, data1, root0, 6, 7 - gs_butterfly data2, data3, root1, 0, 1 - gs_butterfly data4, data5, root1, 2, 3 - gs_butterfly data6, data7, root1, 4, 5 - - gs_butterfly data0, data2, root0, 2, 3 - gs_butterfly data1, data3, root0, 2, 3 - gs_butterfly data4, data6, root0, 4, 5 - gs_butterfly data5, data7, root0, 4, 5 - - gs_butterfly data0, data4, root0, 0, 1 - gs_butterfly data1, data5, root0, 0, 1 - gs_butterfly data2, data6, root0, 0, 1 - gs_butterfly data3, data7, root0, 0, 1 - - // Bounds: < 8q - - str q_data4, [in, #(4*(512/8))] - str q_data5, [in, #(5*(512/8))] - str q_data6, [in, #(6*(512/8))] - str q_data7, [in, #(7*(512/8))] - - str q_data0, [in], #(16) - str q_data1, [in, #(-16 + 1*(512/8))] - str q_data2, [in, #(-16 + 
2*(512/8))] - str q_data3, [in, #(-16 + 3*(512/8))] - - subs count, count, #1 - cbnz count, layer012_start - - pop_stack - ret - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq in - .unreq r01234_ptr - .unreq r56_ptr - .unreq inp - .unreq count - .unreq wtmp - .unreq data0 - .unreq data1 - .unreq data2 - .unreq data3 - .unreq data4 - .unreq data5 - .unreq data6 - .unreq data7 - .unreq q_data0 - .unreq q_data1 - .unreq q_data2 - .unreq q_data3 - .unreq q_data4 - .unreq q_data5 - .unreq q_data6 - .unreq q_data7 - .unreq root0 - .unreq root1 - .unreq root2 - .unreq root0_tw - .unreq root1_tw - .unreq root2_tw - .unreq consts - .unreq q_consts - .unreq q_root0 - .unreq q_root1 - .unreq q_root2 - .unreq q_root0_tw - .unreq q_root1_tw - .unreq q_root2_tw - .unreq tmp - .unreq t0 - .unreq t1 - .unreq t2 - .unreq t3 - .unreq ninv - .unreq ninv_tw - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S index 191de3c4d..0f9e44307 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/intt_opt.S @@ -25,6 +25,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ // Bounds: // If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) @@ -139,9 +140,6 @@ // are NOT canonically reduced. The ordering of the coefficients is canonical, // also matching PQClean. 
-.text - .global MLKEM_ASM_NAMESPACE(intt_asm_opt) - in .req x0 r01234_ptr .req x1 r56_ptr .req x2 @@ -194,7 +192,9 @@ ninv .req v29 ninv_tw .req v30 -.balign 4 + .text + .global MLKEM_ASM_NAMESPACE(intt_asm_opt) + .balign 4 MLKEM_ASM_NAMESPACE(intt_asm_opt): push_stack @@ -1042,4 +1042,5 @@ layer012_start: .unreq ninv .unreq ninv_tw +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S deleted file mode 100644 index 4f844e212..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/ntt_clean.S +++ /dev/null @@ -1,317 +0,0 @@ -/// -/// Copyright (c) 2022 Arm Limited -/// Copyright (c) 2022 Hanno Becker -/// Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer -/// Copyright (c) 2024 The mlkem-native project authors -// SPDX-License-Identifier: MIT -/// -/// Permission is hereby granted, free of charge, to any person obtaining a copy -/// of this software and associated documentation files (the "Software"), to deal -/// in the Software without restriction, including without limitation the rights -/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -/// copies of the Software, and to permit persons to whom the Software is -/// furnished to do so, subject to the following conditions: -/// -/// The above copyright notice and this permission notice shall be included in all -/// copies or substantial portions of the Software. -/// -/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -/// SOFTWARE. -/// - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - -// Bounds: -// If C is chosen so that |src| < q * C, then |dst| < q * (0.0508 * C + 1/2) -// -// See mlken/reduce.c and test/test_bounds.py for more details. -.macro mulmodq dst, src, const, idx0, idx1 - // Signed barrett multiplication using - // round-to-nearest-even-integer approximation. - // Following https://eprint.iacr.org/2021/986.pdf, this - // is functionally the same as a signed Montgomery multiplication - // with a suitable constant of absolute value < q. - sqrdmulh t2.8h, \src\().8h, \const\().h[\idx1\()] - mul \dst\().8h, \src\().8h, \const\().h[\idx0\()] - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro mulmod dst, src, const, const_twisted - sqrdmulh t2.8h, \src\().8h, \const_twisted\().8h - mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, t2.8h, consts.h[0] -.endm - -.macro ct_butterfly a, b, root, idx0, idx1 - mulmodq tmp, \b, \root, \idx0, \idx1 - sub \b\().8h, \a\().8h, tmp.8h - add \a\().8h, \a\().8h, tmp.8h -.endm - -.macro ct_butterfly_v a, b, root, root_twisted - mulmod tmp, \b, \root, \root_twisted - sub \b\().8h, \a\().8h, tmp.8h - add \a\().8h, \a\().8h, tmp.8h -.endm - -.macro load_roots_012 - ldr q_root0, [r01234_ptr], #32 - ldr q_root1, [r01234_ptr, #-16] -.endm - -.macro load_next_roots_34 - ldr q_root0, [r01234_ptr], #16 -.endm - -.macro load_next_roots_56 - ldr q_root0, [r56_ptr], #(6*16) - ldr q_root0_tw, [r56_ptr, #(-6*16 + 1*16)] - ldr q_root1, [r56_ptr, #(-6*16 + 2*16)] - ldr q_root1_tw, [r56_ptr, #(-6*16 + 3*16)] - ldr q_root2, [r56_ptr, #(-6*16 + 4*16)] - ldr q_root2_tw, [r56_ptr, #(-6*16 + 5*16)] -.endm - -.macro transpose4 data - trn1 t0.4s, 
\data\()0.4s, \data\()1.4s - trn2 t1.4s, \data\()0.4s, \data\()1.4s - trn1 t2.4s, \data\()2.4s, \data\()3.4s - trn2 t3.4s, \data\()2.4s, \data\()3.4s - - trn2 \data\()2.2d, t0.2d, t2.2d - trn2 \data\()3.2d, t1.2d, t3.2d - trn1 \data\()0.2d, t0.2d, t2.2d - trn1 \data\()1.2d, t1.2d, t3.2d -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs -.endm - -.macro pop_stack - restore_vregs -.endm - - // Arguments - in .req x0 // Input/output buffer - r01234_ptr .req x1 // twiddles for layer 0,1,2,3,4 - r56_ptr .req x2 // twiddles for layer 5,6 - - inp .req x3 - count .req x4 - wtmp .req w5 - - data0 .req v8 - data1 .req v9 - data2 .req v10 - data3 .req v11 - data4 .req v12 - data5 .req v13 - data6 .req v14 - data7 .req v15 - - q_data0 .req q8 - q_data1 .req q9 - q_data2 .req q10 - q_data3 .req q11 - q_data4 .req q12 - q_data5 .req q13 - q_data6 .req q14 - q_data7 .req q15 - - root0 .req v0 - root1 .req v1 - root2 .req v2 - root0_tw .req v4 - root1_tw .req v5 - root2_tw .req v6 - - q_root0 .req q0 - q_root1 .req q1 - q_root2 .req q2 - q_root0_tw .req q4 - q_root1_tw .req q5 - q_root2_tw .req q6 - - consts .req v7 - - tmp .req v24 - t0 .req v25 - t1 .req v26 - t2 .req v27 - t3 .req v28 - - .text - .global MLKEM_ASM_NAMESPACE(ntt_asm_clean) - - .balign 4 -MLKEM_ASM_NAMESPACE(ntt_asm_clean): - push_stack - - mov wtmp, #3329 - mov consts.h[0], wtmp - mov wtmp, #20159 - mov consts.h[1], wtmp - - mov inp, in - mov count, #4 - - load_roots_012 - - .p2align 2 - - // Bounds reasoning: - // - There are 7 layers - // - When passing from layer N to layer N+1, each layer-N value - // is modified through the addition/subtraction of a Montgomery - // product of a twiddle of absolute 
value < q/2 and a layer-N value. - // - Recalling that for C such that |a| < C * q and |t|> 0); - xtn out0.8b, data0.8h - - // r[3 * i + 1] = (t0 >> 8); - shrn out1.8b, data0.8h, #8 - xtn tmp.8b, data1.8h - // r[3 * i + 1] = (t0 >> 8) | (t1 << 4); - sli out1.8b, tmp.8b, #4 - - // r[3 * i + 2] = (t1 >> 4); - shrn out2.8b, data1.8h, #4 - - st3 {out0.8b, out1.8b, out2.8b}, [dst], #24 - - subs count, count, #1 - cbnz count, poly_tobytes_asm_clean_asm_loop_start - ret - - .unreq data0 - .unreq data1 - .unreq out0 - .unreq out1 - .unreq out2 - .unreq tmp - .unreq dst - .unreq src - .unreq count - -/********************************** - * poly_tomont() * - **********************************/ -.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean) - - src .req x0 - count .req x1 - wtmp .req w2 - - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 - - factor .req v2 - factor_t .req v3 - modulus .req v4 - modulus_twisted .req v5 - - tmp0 .req v6 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_tomont_asm_clean): - - mov wtmp, #3329 // ML-KEM modulus - dup modulus.8h, wtmp - - mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 - dup modulus_twisted.8h, wtmp - - mov wtmp, #-1044 // 2^16 % 3329 - dup factor.8h, wtmp - - mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16) - dup factor_t.8h, wtmp - - mov count, #8 -poly_tomont_asm_loop: - - ldr q_data, [src], #64 - mulmod res, data, factor, factor_t - str q_res, [src, #-64] - - ldr q_data, [src, #-48] - mulmod res, data, factor, factor_t - str q_res, [src, #-48] - - ldr q_data, [src, #-32] - mulmod res, data, factor, factor_t - str q_res, [src, #-32] - - ldr q_data, [src, #-16] - mulmod res, data, factor, factor_t - str q_res, [src, #-16] - - sub count, count, #1 - cbnz count, poly_tomont_asm_loop - - ret - - .unreq src - .unreq count - .unreq wtmp - - .unreq data - .unreq q_data - .unreq res - .unreq q_res - - .unreq factor - .unreq factor_t - .unreq modulus - .unreq modulus_twisted - - .unreq tmp0 - -#endif /* 
MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S new file mode 100644 index 000000000..a3593b7fd --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_mulcache_compute_asm_opt.S @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in consts.h[0]. */ +.macro mulmod dst, src, const, const_twisted + sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h + mul \dst\().8h, \src\().8h, \const\().8h + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/******************************************** + * poly_mulcache_compute() * + ********************************************/ + + + cache_ptr .req x0 + data_ptr .req x1 + zeta_ptr .req x2 + zeta_twisted_ptr .req x3 + count .req x4 + wtmp .req w5 + + data_odd .req v0 + zeta .req v1 + q_zeta .req q1 + zeta_twisted .req v2 + q_zeta_twisted .req q2 + + tmp0 .req v3 + q_tmp0 .req q3 + tmp1 .req v4 + q_tmp1 .req q4 + dst .req v5 + q_dst .req q5 + + modulus .req v6 + modulus_twisted .req v7 + + .text + .global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #20159 + dup modulus_twisted.8h, wtmp + + mov count, #16 + // Instructions: 7 + // Expected cycles: 12 + // Expected IPC: 0.58 + + // Cycle bound: 12.0 + // IPC bound: 0.58 + + // Wall time: 0.01s + // User time: 0.01s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q1, [x1, #16] // *............................. 
+ ldr q27, [x1], #32 // ..*........................... + ldr q23, [x2], #16 // ....*......................... + uzp2 v27.8H, v27.8H, v1.8H // ......*....................... + ldr q1, [x3], #16 // .......*...................... + mul v2.8H, v27.8H, v23.8H // .........*.................... + sqrdmulh v27.8H, v27.8H, v1.8H // ...........*.................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q29, [x1, #16] // *.............................. + // ldr q21, [x2], #16 // ....*.......................... + // ldr q27, [x1], #32 // ..*............................ + // ldr q7, [x3], #16 // .......*....................... + // uzp2 v28.8H, v27.8H, v29.8H // ......*........................ + // mul v2.8H, v28.8H, v21.8H // .........*..................... + // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*................... + + sub count, count, #1 +poly_mulcache_compute_asm_opt_loop: + // Instructions: 9 + // Expected cycles: 13 + // Expected IPC: 0.69 + + // Cycle bound: 13.0 + // IPC bound: 0.69 + + // Wall time: 0.09s + // User time: 0.09s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q29, [x1, #16] // e............................. + ldr q21, [x2], #16 // ..e........................... + mls v2.8H, v27.8H, v6.H[0] // ....*......................... + ldr q27, [x1], #32 // .....e........................ + ldr q7, [x3], #16 // .......e...................... + uzp2 v28.8H, v27.8H, v29.8H // .........e.................... + str q2, [x0], #16 // ..........*................... + mul v2.8H, v28.8H, v21.8H // ...........e.................. + sqrdmulh v27.8H, v28.8H, v7.8H // ............e................. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q3, [x1], #32 // .....e.......'....~.......'.... + // ldr q4, [x1, #-16] // e............~............~.... + // ldr q1, [x2], #16 // ..e..........'.~..........'.~.. 
+ // ldr q2, [x3], #16 // .......e.....'......~.....'.... + // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'.... + // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'.... + // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'.... + // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'.... + // str q5, [x0], #16 // ..........~..'.........*..'.... + + sub count, count, 1 + cbnz count, poly_mulcache_compute_asm_opt_loop + // Instructions: 2 + // Expected cycles: 5 + // Expected IPC: 0.40 + + // Cycle bound: 5.0 + // IPC bound: 0.40 + + // Wall time: 0.00s + // User time: 0.00s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + mls v2.8H, v27.8H, v6.H[0] // *............................. + str q2, [x0], #16 // ....*......................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // mls v2.8H, v27.8H, v6.H[0] // *.............................. + // str q2, [x0], #16 // ....*.......................... 
+ + + ret + + .unreq cache_ptr + .unreq data_ptr + .unreq zeta_ptr + .unreq zeta_twisted_ptr + .unreq count + .unreq wtmp + + .unreq data_odd + .unreq zeta + .unreq q_zeta + .unreq zeta_twisted + .unreq q_zeta_twisted + + .unreq tmp0 + .unreq q_tmp0 + .unreq tmp1 + .unreq q_tmp1 + .unreq dst + .unreq q_dst + + .unreq modulus + .unreq modulus_twisted + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S deleted file mode 100644 index 79605818f..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_opt.S +++ /dev/null @@ -1,670 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) - -/* - * Some modular arithmetic macros - */ - -/* Barrett reduction */ -.macro barrett_reduce a - sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] - srshr tmp.8h, tmp.8h, #11 - mls \a\().8h, tmp.8h, modulus.h[0] -.endm - -/* Montgomery multiplication, with precomputed Montgomery twist - * Expects modulus in consts.h[0]. */ -.macro mulmod dst, src, const, const_twisted - sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h - mul \dst\().8h, \src\().8h, \const\().8h - mls \dst\().8h, tmp0.8h, modulus.h[0] -.endm - -/* Turns signed-canonical to unsigned canonical representative - * through conditional addition of the modulus. - * - * Expected modulus in `modulus`. 
*/ -.macro scalar_signed_to_unsigned a - sshr mask.8h, \a\().8h, #15 - and mask.16b, modulus.16b, mask.16b - add \a\().8h, \a\().8h, mask.8h -.endm - -/********************************** - * poly_reduce() * - **********************************/ - -.global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt) - - ptr .req x0 - count .req x1 - wtmp .req w2 - - data .req v0 - q_data .req q0 - - tmp .req v1 - mask .req v2 - modulus .req v3 - modulus_twisted .req v4 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): - - mov wtmp, #3329 // ML-KEM modulus - dup modulus.8h, wtmp - - mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 - dup modulus_twisted.8h, wtmp - - mov count, #8 - // Instructions: 15 - // Expected cycles: 22 - // Expected IPC: 0.68 - - // Cycle bound: 22.0 - // IPC bound: 0.68 - - // Wall time: 0.05s - // User time: 0.05s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q21, [x0, #32] // *............................. - ldr q23, [x0, #48] // ..*........................... - sqdmulh v7.8H, v21.8H, v4.H[0] // ....*......................... - sqdmulh v30.8H, v23.8H, v4.H[0] // ......*....................... - srshr v7.8H, v7.8H, #11 // ........*..................... - srshr v30.8H, v30.8H, #11 // ..........*................... - mls v21.8H, v7.8H, v3.H[0] // ...........*.................. - mls v23.8H, v30.8H, v3.H[0] // .............*................ - ldr q5, [x0, #16] // ..............*............... - sshr v7.8H, v21.8H, #15 // ................*............. - sshr v30.8H, v23.8H, #15 // .................*............ - and v7.16B, v3.16B, v7.16B // ..................*........... - add v21.8H, v21.8H, v7.8H // ...................*.......... - and v7.16B, v3.16B, v30.16B // ....................*......... - add v16.8H, v23.8H, v7.8H // .....................*........ - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q30, [x0, #32] // *.............................. 
- // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*.......................... - // ldr q2, [x0, #48] // ..*............................ - // srshr v19.8H, v22.8H, #11 // ........*...................... - // mls v30.8H, v19.8H, v3.H[0] // ...........*................... - // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................ - // sshr v31.8H, v30.8H, #15 // ................*.............. - // srshr v25.8H, v25.8H, #11 // ..........*.................... - // and v18.16B, v3.16B, v31.16B // ..................*............ - // mls v2.8H, v25.8H, v3.H[0] // .............*................. - // add v21.8H, v30.8H, v18.8H // ...................*........... - // ldr q5, [x0, #16] // ..............*................ - // sshr v18.8H, v2.8H, #15 // .................*............. - // and v27.16B, v3.16B, v18.16B // ....................*.......... - // add v16.8H, v2.8H, v27.8H // .....................*......... - - sub count, count, #1 -1: - // Instructions: 32 - // Expected cycles: 36 - // Expected IPC: 0.89 - - // Cycle bound: 36.0 - // IPC bound: 0.89 - - // Wall time: 1.05s - // User time: 1.05s - - // -------- cycle (expected) ---------> - // 0 25 - // |------------------------|---------- - ldr q6, [x0], #64 // *................................... - ldr q30, [x0, #32] // ..e................................. - sqdmulh v31.8H, v6.8H, v4.H[0] // ....*............................... - sqdmulh v29.8H, v5.8H, v4.H[0] // .....*.............................. - sqdmulh v22.8H, v30.8H, v4.H[0] // ......e............................. - str q16, [x0, #-16] // .......*............................ - srshr v20.8H, v31.8H, #11 // ........*........................... - srshr v28.8H, v29.8H, #11 // .........*.......................... - str q21, [x0, #-32] // ..........*......................... - mls v6.8H, v20.8H, v3.H[0] // ...........*........................ - mls v5.8H, v28.8H, v3.H[0] // ............*....................... 
- ldr q2, [x0, #48] // .............e...................... - sshr v31.8H, v6.8H, #15 // ...............*.................... - srshr v19.8H, v22.8H, #11 // ................e................... - and v22.16B, v3.16B, v31.16B // .................*.................. - add v0.8H, v6.8H, v22.8H // ..................*................. - mls v30.8H, v19.8H, v3.H[0] // ...................e................ - sshr v26.8H, v5.8H, #15 // ....................*............... - sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e.............. - and v17.16B, v3.16B, v26.16B // ......................*............. - add v1.8H, v5.8H, v17.8H // .......................*............ - sshr v31.8H, v30.8H, #15 // ........................e........... - srshr v25.8H, v25.8H, #11 // .........................e.......... - str q1, [x0, #-48] // ..........................*......... - and v18.16B, v3.16B, v31.16B // ...........................e........ - mls v2.8H, v25.8H, v3.H[0] // ............................e....... - add v21.8H, v30.8H, v18.8H // .............................e...... - ldr q5, [x0, #16] // ..............................e..... - sshr v18.8H, v2.8H, #15 // ................................e... - str q0, [x0, #-64] // .................................*.. - and v27.16B, v3.16B, v18.16B // ..................................e. - add v16.8H, v2.8H, v27.8H // ...................................e - - // ------------------------ cycle (expected) -------------------------> - // 0 25 50 - // |------------------------|------------------------|----------------- - // ldr q0, [x0], #64 // ..................................*................................. - // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*............................. - // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*......................... - // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*...................... 
- // sshr v2.8h, v0.8h, #15 // .............~....................'..............*.................. - // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................ - // add v0.8h, v0.8h, v2.8h // ................~.................'.................*............... - // str q0, [x0, #-64] // ...............................~..'................................* - // ldr q0, [x0, #-48] // ............................e.....'.............................~... - // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................ - // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................ - // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*..................... - // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*............. - // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*........... - // add v0.8h, v0.8h, v2.8h // .....................~............'......................*.......... - // str q0, [x0, #-48] // ........................~.........'.........................*....... - // ldr q0, [x0, #-32] // e.................................'.~............................... - // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~........................... - // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~................. - // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~.............. - // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~......... - // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~...... - // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~.... 
- // str q0, [x0, #-32] // ........~.........................'.........*....................... - // ldr q0, [x0, #-16] // ...........e......................'............~.................... - // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............ - // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ - // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~..... - // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~. - // and v2.16b, v3.16b, v2.16b // ................................e.'................................. - // add v0.8h, v0.8h, v2.8h // .................................e'................................. - // str q0, [x0, #-16] // .....~............................'......*.......................... - - sub count, count, 1 - cbnz count, 1b - // Instructions: 17 - // Expected cycles: 23 - // Expected IPC: 0.74 - - // Cycle bound: 23.0 - // IPC bound: 0.74 - - // Wall time: 0.05s - // User time: 0.05s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - sqdmulh v20.8H, v5.8H, v4.H[0] // *............................. - ldr q24, [x0], #64 // .*............................ - str q21, [x0, #-32] // ...*.......................... - srshr v20.8H, v20.8H, #11 // ....*......................... - sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................ - str q16, [x0, #-16] // ......*....................... - mls v5.8H, v20.8H, v3.H[0] // .......*...................... - srshr v20.8H, v25.8H, #11 // .........*.................... - sshr v2.8H, v5.8H, #15 // ...........*.................. - mls v24.8H, v20.8H, v3.H[0] // ............*................. - and v20.16B, v3.16B, v2.16B // .............*................ - add v31.8H, v5.8H, v20.8H // ..............*............... - sshr v20.8H, v24.8H, #15 // ................*............. 
- str q31, [x0, #-48] // .................*............ - and v31.16B, v3.16B, v20.16B // ..................*........... - add v24.8H, v24.8H, v31.8H // ...................*.......... - str q24, [x0, #-64] // ......................*....... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q6, [x0], #64 // .*............................. - // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*......................... - // sqdmulh v29.8H, v5.8H, v4.H[0] // *.............................. - // str q16, [x0, #-16] // ......*........................ - // srshr v20.8H, v31.8H, #11 // .........*..................... - // srshr v28.8H, v29.8H, #11 // ....*.......................... - // str q21, [x0, #-32] // ...*........................... - // mls v6.8H, v20.8H, v3.H[0] // ............*.................. - // mls v5.8H, v28.8H, v3.H[0] // .......*....................... - // sshr v31.8H, v6.8H, #15 // ................*.............. - // and v22.16B, v3.16B, v31.16B // ..................*............ - // add v0.8H, v6.8H, v22.8H // ...................*........... - // sshr v26.8H, v5.8H, #15 // ...........*................... - // and v17.16B, v3.16B, v26.16B // .............*................. - // add v1.8H, v5.8H, v17.8H // ..............*................ - // str q1, [x0, #-48] // .................*............. - // str q0, [x0, #-64] // ......................*........ 
- - - ret - - .unreq ptr - .unreq count - .unreq wtmp - - .unreq data - .unreq q_data - - .unreq tmp - .unreq mask - .unreq modulus - .unreq modulus_twisted - -/******************************************** - * poly_mulcache_compute() * - ********************************************/ - -.global MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt) - - cache_ptr .req x0 - data_ptr .req x1 - zeta_ptr .req x2 - zeta_twisted_ptr .req x3 - count .req x4 - wtmp .req w5 - - data_odd .req v0 - zeta .req v1 - q_zeta .req q1 - zeta_twisted .req v2 - q_zeta_twisted .req q2 - - tmp0 .req v3 - q_tmp0 .req q3 - tmp1 .req v4 - q_tmp1 .req q4 - dst .req v5 - q_dst .req q5 - - modulus .req v6 - modulus_twisted .req v7 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_mulcache_compute_asm_opt): - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #20159 - dup modulus_twisted.8h, wtmp - - mov count, #16 - // Instructions: 7 - // Expected cycles: 12 - // Expected IPC: 0.58 - - // Cycle bound: 12.0 - // IPC bound: 0.58 - - // Wall time: 0.01s - // User time: 0.01s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q1, [x1, #16] // *............................. - ldr q27, [x1], #32 // ..*........................... - ldr q23, [x2], #16 // ....*......................... - uzp2 v27.8H, v27.8H, v1.8H // ......*....................... - ldr q1, [x3], #16 // .......*...................... - mul v2.8H, v27.8H, v23.8H // .........*.................... - sqrdmulh v27.8H, v27.8H, v1.8H // ...........*.................. - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q29, [x1, #16] // *.............................. - // ldr q21, [x2], #16 // ....*.......................... - // ldr q27, [x1], #32 // ..*............................ - // ldr q7, [x3], #16 // .......*....................... - // uzp2 v28.8H, v27.8H, v29.8H // ......*........................ - // mul v2.8H, v28.8H, v21.8H // .........*..................... 
- // sqrdmulh v27.8H, v28.8H, v7.8H // ...........*................... - - sub count, count, #1 -1: - // Instructions: 9 - // Expected cycles: 13 - // Expected IPC: 0.69 - - // Cycle bound: 13.0 - // IPC bound: 0.69 - - // Wall time: 0.09s - // User time: 0.09s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q29, [x1, #16] // e............................. - ldr q21, [x2], #16 // ..e........................... - mls v2.8H, v27.8H, v6.H[0] // ....*......................... - ldr q27, [x1], #32 // .....e........................ - ldr q7, [x3], #16 // .......e...................... - uzp2 v28.8H, v27.8H, v29.8H // .........e.................... - str q2, [x0], #16 // ..........*................... - mul v2.8H, v28.8H, v21.8H // ...........e.................. - sqrdmulh v27.8H, v28.8H, v7.8H // ............e................. - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q3, [x1], #32 // .....e.......'....~.......'.... - // ldr q4, [x1, #-16] // e............~............~.... - // ldr q1, [x2], #16 // ..e..........'.~..........'.~.. - // ldr q2, [x3], #16 // .......e.....'......~.....'.... - // uzp2 v0.8h, v3.8h, v4.8h // .........e...'........~...'.... - // sqrdmulh v3.8h, v0.8h, v2.8h // ............e'...........~'.... - // mul v5.8h, v0.8h, v1.8h // ...........e.'..........~.'.... - // mls v5.8h, v3.8h, v6.h[0] // ....~........'...*........'.... - // str q5, [x0], #16 // ..........~..'.........*..'.... - - sub count, count, 1 - cbnz count, 1b - // Instructions: 2 - // Expected cycles: 5 - // Expected IPC: 0.40 - - // Cycle bound: 5.0 - // IPC bound: 0.40 - - // Wall time: 0.00s - // User time: 0.00s - - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - mls v2.8H, v27.8H, v6.H[0] // *............................. - str q2, [x0], #16 // ....*......................... 
- - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // mls v2.8H, v27.8H, v6.H[0] // *.............................. - // str q2, [x0], #16 // ....*.......................... - - - ret - - .unreq cache_ptr - .unreq data_ptr - .unreq zeta_ptr - .unreq zeta_twisted_ptr - .unreq count - .unreq wtmp - - .unreq data_odd - .unreq zeta - .unreq q_zeta - .unreq zeta_twisted - .unreq q_zeta_twisted - - .unreq tmp0 - .unreq q_tmp0 - .unreq tmp1 - .unreq q_tmp1 - .unreq dst - .unreq q_dst - - .unreq modulus - .unreq modulus_twisted - -/******************************************** - * poly_tobytes() * - ********************************************/ -.global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt) - - data0 .req v0 - data1 .req v1 - out0 .req v2 - out1 .req v3 - out2 .req v4 - tmp .req v5 - - dst .req x0 - src .req x1 - count .req x2 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt): - - mov count, #16 -poly_tobytes_asm_opt_asm_loop_start: - ld2 {data0.8h, data1.8h}, [src], #32 - - // r[3 * i + 0] = (t0 >> 0); - xtn out0.8b, data0.8h - - // r[3 * i + 1] = (t0 >> 8); - shrn out1.8b, data0.8h, #8 - xtn tmp.8b, data1.8h - // r[3 * i + 1] = (t0 >> 8) | (t1 << 4); - sli out1.8b, tmp.8b, #4 - - // r[3 * i + 2] = (t1 >> 4); - shrn out2.8b, data1.8h, #4 - - st3 {out0.8b, out1.8b, out2.8b}, [dst], #24 - - subs count, count, #1 - cbnz count, poly_tobytes_asm_opt_asm_loop_start - ret - - .unreq data0 - .unreq data1 - .unreq out0 - .unreq out1 - .unreq out2 - .unreq tmp - .unreq dst - .unreq src - .unreq count - -/********************************** - * poly_tomont() * - **********************************/ -.global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt) - - src .req x0 - count .req x1 - wtmp .req w2 - - data .req v0 - q_data .req q0 - res .req v1 - q_res .req q1 - - factor .req v2 - factor_t .req v3 - modulus .req v4 - modulus_twisted .req v5 - - tmp0 .req v6 - -.balign 4 -MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): - - mov wtmp, #3329 // ML-KEM 
modulus - dup modulus.8h, wtmp - - mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 - dup modulus_twisted.8h, wtmp - - mov wtmp, #-1044 // 2^16 % 3329 - dup factor.8h, wtmp - - mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16) - dup factor_t.8h, wtmp - - mov count, #8 - // Instructions: 5 - // Expected cycles: 7 - // Expected IPC: 0.71 - // - // Cycle bound: 7.0 - // IPC bound: 0.71 - // - // Wall time: 0.01s - // User time: 0.01s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - ldr q26, [x0, #48] // *............................. - ldr q23, [x0, #16] // ..*........................... - mul v17.8H, v26.8H, v2.8H // ....*......................... - sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................ - ldr q27, [x0, #32] // ......*....................... - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // ldr q7, [x0, #48] // *.............................. - // ldr q23, [x0, #16] // ..*............................ - // mul v17.8H, v7.8H, v2.8H // ....*.......................... - // sqrdmulh v7.8H, v7.8H, v3.8H // .....*......................... - // ldr q27, [x0, #32] // ......*........................ - - sub count, count, #1 -1: - // Instructions: 20 - // Expected cycles: 24 - // Expected IPC: 0.83 - // - // Cycle bound: 24.0 - // IPC bound: 0.83 - // - // Wall time: 0.73s - // User time: 0.73s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - mls v17.8H, v7.8H, v4.H[0] // *............................. - sqrdmulh v5.8H, v23.8H, v3.8H // .*............................ - ldr q7, [x0], #64 // ..*........................... - str q17, [x0, #-16] // ....*......................... - sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................ - sqrdmulh v19.8H, v7.8H, v3.8H // ......*....................... - mul v25.8H, v23.8H, v2.8H // .......*...................... - mul v0.8H, v7.8H, v2.8H // ........*..................... 
- mul v26.8H, v27.8H, v2.8H // .........*.................... - ldr q7, [x0, #48] // ..........e................... - mls v25.8H, v5.8H, v4.H[0] // ............*................. - ldr q23, [x0, #16] // .............e................ - mls v26.8H, v29.8H, v4.H[0] // ...............*.............. - mls v0.8H, v19.8H, v4.H[0] // ................*............. - str q25, [x0, #-48] // .................*............ - mul v17.8H, v7.8H, v2.8H // ..................e........... - sqrdmulh v7.8H, v7.8H, v3.8H // ...................e.......... - str q0, [x0, #-64] // ....................*......... - ldr q27, [x0, #32] // .....................e........ - str q26, [x0, #-32] // .......................*...... - - // --------- cycle (expected) ----------> - // 0 25 - // |------------------------|------------ - // ldr q0, [x0], #64 // ..............'.*..................... - // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*................. - // mul v1.8h, v0.8h, v2.8h // ..............'.......*............... - // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*....... - // str q1, [x0, #-64] // ..........~...'...................*... - // ldr q0, [x0, #-48] // ...e..........'............~.......... - // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*...................... - // mul v1.8h, v0.8h, v2.8h // ..............'......*................ - // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*........... - // str q1, [x0, #-48] // .......~......'................*...... - // ldr q0, [x0, #-32] // ...........e..'....................~.. - // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*.................. - // mul v1.8h, v0.8h, v2.8h // ..............'........*.............. - // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........ - // str q1, [x0, #-32] // .............~'......................* - // ldr q0, [x0, #-16] // e.............'.........~............. 
- // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~.... - // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~..... - // mls v1.8h, v6.8h, v4.h[0] // ..............*....................... - // str q1, [x0, #-16] // ..............'...*................... - - sub count, count, 1 - cbnz count, 1b - // Instructions: 15 - // Expected cycles: 18 - // Expected IPC: 0.83 - // - // Cycle bound: 18.0 - // IPC bound: 0.83 - // - // Wall time: 0.07s - // User time: 0.07s - // - // ----- cycle (expected) ------> - // 0 25 - // |------------------------|---- - mls v17.8H, v7.8H, v4.H[0] // *............................. - sqrdmulh v7.8H, v23.8H, v3.8H // .*............................ - mul v26.8H, v23.8H, v2.8H // ..*........................... - sqrdmulh v25.8H, v27.8H, v3.8H // ...*.......................... - ldr q23, [x0], #64 // ....*......................... - mul v27.8H, v27.8H, v2.8H // ......*....................... - mls v26.8H, v7.8H, v4.H[0] // .......*...................... - sqrdmulh v7.8H, v23.8H, v3.8H // ........*..................... - mul v23.8H, v23.8H, v2.8H // .........*.................... - str q17, [x0, #-16] // ..........*................... - mls v27.8H, v25.8H, v4.H[0] // ...........*.................. - str q26, [x0, #-48] // ............*................. - mls v23.8H, v7.8H, v4.H[0] // .............*................ - str q27, [x0, #-32] // ...............*.............. - str q23, [x0, #-64] // .................*............ - - // ------ cycle (expected) ------> - // 0 25 - // |------------------------|----- - // mls v17.8H, v7.8H, v4.H[0] // *.............................. - // sqrdmulh v5.8H, v23.8H, v3.8H // .*............................. - // ldr q7, [x0], #64 // ....*.......................... - // str q17, [x0, #-16] // ..........*.................... - // sqrdmulh v29.8H, v27.8H, v3.8H // ...*........................... - // sqrdmulh v19.8H, v7.8H, v3.8H // ........*...................... 
- // mul v25.8H, v23.8H, v2.8H // ..*............................ - // mul v0.8H, v7.8H, v2.8H // .........*..................... - // mul v26.8H, v27.8H, v2.8H // ......*........................ - // mls v25.8H, v5.8H, v4.H[0] // .......*....................... - // mls v26.8H, v29.8H, v4.H[0] // ...........*................... - // mls v0.8H, v19.8H, v4.H[0] // .............*................. - // str q25, [x0, #-48] // ............*.................. - // str q0, [x0, #-64] // .................*............. - // str q26, [x0, #-32] // ...............*............... - - - ret - - .unreq src - .unreq count - .unreq wtmp - - .unreq data - .unreq q_data - .unreq res - .unreq q_res - - .unreq factor - .unreq factor_t - .unreq modulus - .unreq modulus_twisted - - .unreq tmp0 - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_reduce_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_reduce_asm_opt.S new file mode 100644 index 000000000..410950730 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_reduce_asm_opt.S @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/* Barrett reduction */ +.macro barrett_reduce a + sqdmulh tmp.8h, \a\().8h, modulus_twisted.h[0] + srshr tmp.8h, tmp.8h, #11 + mls \a\().8h, tmp.8h, modulus.h[0] +.endm + +/* Turns signed-canonical to unsigned canonical representative + * through conditional addition of the modulus. + * + * Expected modulus in `modulus`. 
*/ +.macro scalar_signed_to_unsigned a + sshr mask.8h, \a\().8h, #15 + and mask.16b, modulus.16b, mask.16b + add \a\().8h, \a\().8h, mask.8h +.endm + +/********************************** + * poly_reduce() * + **********************************/ + + ptr .req x0 + count .req x1 + wtmp .req w2 + + data .req v0 + q_data .req q0 + + tmp .req v1 + mask .req v2 + modulus .req v3 + modulus_twisted .req v4 + + .text + .global MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_reduce_asm_opt): + + mov wtmp, #3329 // ML-KEM modulus + dup modulus.8h, wtmp + + mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 + dup modulus_twisted.8h, wtmp + + mov count, #8 + // Instructions: 15 + // Expected cycles: 22 + // Expected IPC: 0.68 + + // Cycle bound: 22.0 + // IPC bound: 0.68 + + // Wall time: 0.05s + // User time: 0.05s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q21, [x0, #32] // *............................. + ldr q23, [x0, #48] // ..*........................... + sqdmulh v7.8H, v21.8H, v4.H[0] // ....*......................... + sqdmulh v30.8H, v23.8H, v4.H[0] // ......*....................... + srshr v7.8H, v7.8H, #11 // ........*..................... + srshr v30.8H, v30.8H, #11 // ..........*................... + mls v21.8H, v7.8H, v3.H[0] // ...........*.................. + mls v23.8H, v30.8H, v3.H[0] // .............*................ + ldr q5, [x0, #16] // ..............*............... + sshr v7.8H, v21.8H, #15 // ................*............. + sshr v30.8H, v23.8H, #15 // .................*............ + and v7.16B, v3.16B, v7.16B // ..................*........... + add v21.8H, v21.8H, v7.8H // ...................*.......... + and v7.16B, v3.16B, v30.16B // ....................*......... + add v16.8H, v23.8H, v7.8H // .....................*........ + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q30, [x0, #32] // *.............................. 
+ // sqdmulh v22.8H, v30.8H, v4.H[0] // ....*.......................... + // ldr q2, [x0, #48] // ..*............................ + // srshr v19.8H, v22.8H, #11 // ........*...................... + // mls v30.8H, v19.8H, v3.H[0] // ...........*................... + // sqdmulh v25.8H, v2.8H, v4.H[0] // ......*........................ + // sshr v31.8H, v30.8H, #15 // ................*.............. + // srshr v25.8H, v25.8H, #11 // ..........*.................... + // and v18.16B, v3.16B, v31.16B // ..................*............ + // mls v2.8H, v25.8H, v3.H[0] // .............*................. + // add v21.8H, v30.8H, v18.8H // ...................*........... + // ldr q5, [x0, #16] // ..............*................ + // sshr v18.8H, v2.8H, #15 // .................*............. + // and v27.16B, v3.16B, v18.16B // ....................*.......... + // add v16.8H, v2.8H, v27.8H // .....................*......... + + sub count, count, #1 +poly_reduce_asm_opt_loop: + // Instructions: 32 + // Expected cycles: 36 + // Expected IPC: 0.89 + + // Cycle bound: 36.0 + // IPC bound: 0.89 + + // Wall time: 1.05s + // User time: 1.05s + + // -------- cycle (expected) ---------> + // 0 25 + // |------------------------|---------- + ldr q6, [x0], #64 // *................................... + ldr q30, [x0, #32] // ..e................................. + sqdmulh v31.8H, v6.8H, v4.H[0] // ....*............................... + sqdmulh v29.8H, v5.8H, v4.H[0] // .....*.............................. + sqdmulh v22.8H, v30.8H, v4.H[0] // ......e............................. + str q16, [x0, #-16] // .......*............................ + srshr v20.8H, v31.8H, #11 // ........*........................... + srshr v28.8H, v29.8H, #11 // .........*.......................... + str q21, [x0, #-32] // ..........*......................... + mls v6.8H, v20.8H, v3.H[0] // ...........*........................ + mls v5.8H, v28.8H, v3.H[0] // ............*....................... 
+ ldr q2, [x0, #48] // .............e...................... + sshr v31.8H, v6.8H, #15 // ...............*.................... + srshr v19.8H, v22.8H, #11 // ................e................... + and v22.16B, v3.16B, v31.16B // .................*.................. + add v0.8H, v6.8H, v22.8H // ..................*................. + mls v30.8H, v19.8H, v3.H[0] // ...................e................ + sshr v26.8H, v5.8H, #15 // ....................*............... + sqdmulh v25.8H, v2.8H, v4.H[0] // .....................e.............. + and v17.16B, v3.16B, v26.16B // ......................*............. + add v1.8H, v5.8H, v17.8H // .......................*............ + sshr v31.8H, v30.8H, #15 // ........................e........... + srshr v25.8H, v25.8H, #11 // .........................e.......... + str q1, [x0, #-48] // ..........................*......... + and v18.16B, v3.16B, v31.16B // ...........................e........ + mls v2.8H, v25.8H, v3.H[0] // ............................e....... + add v21.8H, v30.8H, v18.8H // .............................e...... + ldr q5, [x0, #16] // ..............................e..... + sshr v18.8H, v2.8H, #15 // ................................e... + str q0, [x0, #-64] // .................................*.. + and v27.16B, v3.16B, v18.16B // ..................................e. + add v16.8H, v2.8H, v27.8H // ...................................e + + // ------------------------ cycle (expected) -------------------------> + // 0 25 50 + // |------------------------|------------------------|----------------- + // ldr q0, [x0], #64 // ..................................*................................. + // sqdmulh v1.8h, v0.8h, v4.h[0] // ..~...............................'...*............................. + // srshr v1.8h, v1.8h, #11 // ......~...........................'.......*......................... + // mls v0.8h, v1.8h, v3.h[0] // .........~........................'..........*...................... 
+ // sshr v2.8h, v0.8h, #15 // .............~....................'..............*.................. + // and v2.16b, v3.16b, v2.16b // ...............~..................'................*................ + // add v0.8h, v0.8h, v2.8h // ................~.................'.................*............... + // str q0, [x0, #-64] // ...............................~..'................................* + // ldr q0, [x0, #-48] // ............................e.....'.............................~... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...~..............................'....*............................ + // srshr v1.8h, v1.8h, #11 // .......~..........................'........*........................ + // mls v0.8h, v1.8h, v3.h[0] // ..........~.......................'...........*..................... + // sshr v2.8h, v0.8h, #15 // ..................~...............'...................*............. + // and v2.16b, v3.16b, v2.16b // ....................~.............'.....................*........... + // add v0.8h, v0.8h, v2.8h // .....................~............'......................*.......... + // str q0, [x0, #-48] // ........................~.........'.........................*....... + // ldr q0, [x0, #-32] // e.................................'.~............................... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ....e.............................'.....~........................... + // srshr v1.8h, v1.8h, #11 // ..............e...................'...............~................. + // mls v0.8h, v1.8h, v3.h[0] // .................e................'..................~.............. + // sshr v2.8h, v0.8h, #15 // ......................e...........'.......................~......... + // and v2.16b, v3.16b, v2.16b // .........................e........'..........................~...... + // add v0.8h, v0.8h, v2.8h // ...........................e......'............................~.... 
+ // str q0, [x0, #-32] // ........~.........................'.........*....................... + // ldr q0, [x0, #-16] // ...........e......................'............~.................... + // sqdmulh v1.8h, v0.8h, v4.h[0] // ...................e..............'....................~............ + // srshr v1.8h, v1.8h, #11 // .......................e..........'........................~........ + // mls v0.8h, v1.8h, v3.h[0] // ..........................e.......'...........................~..... + // sshr v2.8h, v0.8h, #15 // ..............................e...'...............................~. + // and v2.16b, v3.16b, v2.16b // ................................e.'................................. + // add v0.8h, v0.8h, v2.8h // .................................e'................................. + // str q0, [x0, #-16] // .....~............................'......*.......................... + + sub count, count, 1 + cbnz count, poly_reduce_asm_opt_loop + // Instructions: 17 + // Expected cycles: 23 + // Expected IPC: 0.74 + + // Cycle bound: 23.0 + // IPC bound: 0.74 + + // Wall time: 0.05s + // User time: 0.05s + + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + sqdmulh v20.8H, v5.8H, v4.H[0] // *............................. + ldr q24, [x0], #64 // .*............................ + str q21, [x0, #-32] // ...*.......................... + srshr v20.8H, v20.8H, #11 // ....*......................... + sqdmulh v25.8H, v24.8H, v4.H[0] // .....*........................ + str q16, [x0, #-16] // ......*....................... + mls v5.8H, v20.8H, v3.H[0] // .......*...................... + srshr v20.8H, v25.8H, #11 // .........*.................... + sshr v2.8H, v5.8H, #15 // ...........*.................. + mls v24.8H, v20.8H, v3.H[0] // ............*................. + and v20.16B, v3.16B, v2.16B // .............*................ + add v31.8H, v5.8H, v20.8H // ..............*............... 
+ sshr v20.8H, v24.8H, #15 // ................*............. + str q31, [x0, #-48] // .................*............ + and v31.16B, v3.16B, v20.16B // ..................*........... + add v24.8H, v24.8H, v31.8H // ...................*.......... + str q24, [x0, #-64] // ......................*....... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q6, [x0], #64 // .*............................. + // sqdmulh v31.8H, v6.8H, v4.H[0] // .....*......................... + // sqdmulh v29.8H, v5.8H, v4.H[0] // *.............................. + // str q16, [x0, #-16] // ......*........................ + // srshr v20.8H, v31.8H, #11 // .........*..................... + // srshr v28.8H, v29.8H, #11 // ....*.......................... + // str q21, [x0, #-32] // ...*........................... + // mls v6.8H, v20.8H, v3.H[0] // ............*.................. + // mls v5.8H, v28.8H, v3.H[0] // .......*....................... + // sshr v31.8H, v6.8H, #15 // ................*.............. + // and v22.16B, v3.16B, v31.16B // ..................*............ + // add v0.8H, v6.8H, v22.8H // ...................*........... + // sshr v26.8H, v5.8H, #15 // ...........*................... + // and v17.16B, v3.16B, v26.16B // .............*................. + // add v1.8H, v5.8H, v17.8H // ..............*................ + // str q1, [x0, #-48] // .................*............. + // str q0, [x0, #-64] // ......................*........ 
+ + + ret + + .unreq ptr + .unreq count + .unreq wtmp + + .unreq data + .unreq q_data + + .unreq tmp + .unreq mask + .unreq modulus + .unreq modulus_twisted + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S new file mode 100644 index 000000000..bc33afd43 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tobytes_asm_opt.S @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/******************************************** + * poly_tobytes() * + ********************************************/ + + data0 .req v0 + data1 .req v1 + out0 .req v2 + out1 .req v3 + out2 .req v4 + tmp .req v5 + + dst .req x0 + src .req x1 + count .req x2 + + .text + .global MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_tobytes_asm_opt): + + mov count, #16 +poly_tobytes_asm_opt_asm_loop_start: + ld2 {data0.8h, data1.8h}, [src], #32 + + // r[3 * i + 0] = (t0 >> 0); + xtn out0.8b, data0.8h + + // r[3 * i + 1] = (t0 >> 8); + shrn out1.8b, data0.8h, #8 + xtn tmp.8b, data1.8h + // r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + sli out1.8b, tmp.8b, #4 + + // r[3 * i + 2] = (t1 >> 4); + shrn out2.8b, data1.8h, #4 + + st3 {out0.8b, out1.8b, out2.8b}, [dst], #24 + + subs count, count, #1 + cbnz count, poly_tobytes_asm_opt_asm_loop_start + ret + + .unreq data0 + .unreq data1 + .unreq out0 + .unreq out1 + .unreq out2 + .unreq tmp + .unreq dst + .unreq src + .unreq count + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git 
a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tomont_asm_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tomont_asm_opt.S new file mode 100644 index 000000000..bcbff9adb --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/poly_tomont_asm_opt.S @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ + +/* Montgomery multiplication, with precomputed Montgomery twist + * Expects modulus in consts.h[0]. */ +.macro mulmod dst, src, const, const_twisted + sqrdmulh tmp0.8h, \src\().8h, \const_twisted\().8h + mul \dst\().8h, \src\().8h, \const\().8h + mls \dst\().8h, tmp0.8h, modulus.h[0] +.endm + +/********************************** + * poly_tomont() * + **********************************/ + + src .req x0 + count .req x1 + wtmp .req w2 + + data .req v0 + q_data .req q0 + res .req v1 + q_res .req q1 + + factor .req v2 + factor_t .req v3 + modulus .req v4 + modulus_twisted .req v5 + + tmp0 .req v6 + + + .text + .global MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(poly_tomont_asm_opt): + + mov wtmp, #3329 // ML-KEM modulus + dup modulus.8h, wtmp + + mov wtmp, #20159 // Barrett twist of 1 wrt 2^27 + dup modulus_twisted.8h, wtmp + + mov wtmp, #-1044 // 2^16 % 3329 + dup factor.8h, wtmp + + mov wtmp, #-10276 // Barrett twist of -1044 (wrt 2^16) + dup factor_t.8h, wtmp + + mov count, #8 + // Instructions: 5 + // Expected cycles: 7 + // Expected IPC: 0.71 + // + // Cycle bound: 7.0 + // IPC bound: 0.71 + // + // Wall time: 0.01s + // User time: 0.01s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q26, [x0, #48] // *............................. + ldr q23, [x0, #16] // ..*........................... 
+ mul v17.8H, v26.8H, v2.8H // ....*......................... + sqrdmulh v7.8H, v26.8H, v3.8H // .....*........................ + ldr q27, [x0, #32] // ......*....................... + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q7, [x0, #48] // *.............................. + // ldr q23, [x0, #16] // ..*............................ + // mul v17.8H, v7.8H, v2.8H // ....*.......................... + // sqrdmulh v7.8H, v7.8H, v3.8H // .....*......................... + // ldr q27, [x0, #32] // ......*........................ + + sub count, count, #1 +poly_tomont_asm_opt_loop: + // Instructions: 20 + // Expected cycles: 24 + // Expected IPC: 0.83 + // + // Cycle bound: 24.0 + // IPC bound: 0.83 + // + // Wall time: 0.73s + // User time: 0.73s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + mls v17.8H, v7.8H, v4.H[0] // *............................. + sqrdmulh v5.8H, v23.8H, v3.8H // .*............................ + ldr q7, [x0], #64 // ..*........................... + str q17, [x0, #-16] // ....*......................... + sqrdmulh v29.8H, v27.8H, v3.8H // .....*........................ + sqrdmulh v19.8H, v7.8H, v3.8H // ......*....................... + mul v25.8H, v23.8H, v2.8H // .......*...................... + mul v0.8H, v7.8H, v2.8H // ........*..................... + mul v26.8H, v27.8H, v2.8H // .........*.................... + ldr q7, [x0, #48] // ..........e................... + mls v25.8H, v5.8H, v4.H[0] // ............*................. + ldr q23, [x0, #16] // .............e................ + mls v26.8H, v29.8H, v4.H[0] // ...............*.............. + mls v0.8H, v19.8H, v4.H[0] // ................*............. + str q25, [x0, #-48] // .................*............ + mul v17.8H, v7.8H, v2.8H // ..................e........... + sqrdmulh v7.8H, v7.8H, v3.8H // ...................e.......... + str q0, [x0, #-64] // ....................*......... 
+ ldr q27, [x0, #32] // .....................e........ + str q26, [x0, #-32] // .......................*...... + + // --------- cycle (expected) ----------> + // 0 25 + // |------------------------|------------ + // ldr q0, [x0], #64 // ..............'.*..................... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'.....*................. + // mul v1.8h, v0.8h, v2.8h // ..............'.......*............... + // mls v1.8h, v6.8h, v4.h[0] // ......~.......'...............*....... + // str q1, [x0, #-64] // ..........~...'...................*... + // ldr q0, [x0, #-48] // ...e..........'............~.......... + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'*...................... + // mul v1.8h, v0.8h, v2.8h // ..............'......*................ + // mls v1.8h, v6.8h, v4.h[0] // ..~...........'...........*........... + // str q1, [x0, #-48] // .......~......'................*...... + // ldr q0, [x0, #-32] // ...........e..'....................~.. + // sqrdmulh v6.8h, v0.8h, v3.8h // ..............'....*.................. + // mul v1.8h, v0.8h, v2.8h // ..............'........*.............. + // mls v1.8h, v6.8h, v4.h[0] // .....~........'..............*........ + // str q1, [x0, #-32] // .............~'......................* + // ldr q0, [x0, #-16] // e.............'.........~............. + // sqrdmulh v6.8h, v0.8h, v3.8h // .........e....'..................~.... + // mul v1.8h, v0.8h, v2.8h // ........e.....'.................~..... + // mls v1.8h, v6.8h, v4.h[0] // ..............*....................... + // str q1, [x0, #-16] // ..............'...*................... + + sub count, count, 1 + cbnz count, poly_tomont_asm_opt_loop + // Instructions: 15 + // Expected cycles: 18 + // Expected IPC: 0.83 + // + // Cycle bound: 18.0 + // IPC bound: 0.83 + // + // Wall time: 0.07s + // User time: 0.07s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + mls v17.8H, v7.8H, v4.H[0] // *............................. 
+ sqrdmulh v7.8H, v23.8H, v3.8H // .*............................ + mul v26.8H, v23.8H, v2.8H // ..*........................... + sqrdmulh v25.8H, v27.8H, v3.8H // ...*.......................... + ldr q23, [x0], #64 // ....*......................... + mul v27.8H, v27.8H, v2.8H // ......*....................... + mls v26.8H, v7.8H, v4.H[0] // .......*...................... + sqrdmulh v7.8H, v23.8H, v3.8H // ........*..................... + mul v23.8H, v23.8H, v2.8H // .........*.................... + str q17, [x0, #-16] // ..........*................... + mls v27.8H, v25.8H, v4.H[0] // ...........*.................. + str q26, [x0, #-48] // ............*................. + mls v23.8H, v7.8H, v4.H[0] // .............*................ + str q27, [x0, #-32] // ...............*.............. + str q23, [x0, #-64] // .................*............ + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // mls v17.8H, v7.8H, v4.H[0] // *.............................. + // sqrdmulh v5.8H, v23.8H, v3.8H // .*............................. + // ldr q7, [x0], #64 // ....*.......................... + // str q17, [x0, #-16] // ..........*.................... + // sqrdmulh v29.8H, v27.8H, v3.8H // ...*........................... + // sqrdmulh v19.8H, v7.8H, v3.8H // ........*...................... + // mul v25.8H, v23.8H, v2.8H // ..*............................ + // mul v0.8H, v7.8H, v2.8H // .........*..................... + // mul v26.8H, v27.8H, v2.8H // ......*........................ + // mls v25.8H, v5.8H, v4.H[0] // .......*....................... + // mls v26.8H, v29.8H, v4.H[0] // ...........*................... + // mls v0.8H, v19.8H, v4.H[0] // .............*................. + // str q25, [x0, #-48] // ............*.................. + // str q0, [x0, #-64] // .................*............. + // str q26, [x0, #-32] // ...............*............... 
+ + + ret + + .unreq src + .unreq count + .unreq wtmp + + .unreq data + .unreq q_data + .unreq res + .unreq q_res + + .unreq factor + .unreq factor_t + .unreq modulus + .unreq modulus_twisted + + .unreq tmp0 + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S new file mode 100644 index 000000000..e336b92cb --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2_opt.S @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// AArch64 re-implementation of the asymmetric base multiplication from: + +// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 +// https://eprint.iacr.org/2021/986 +// https://github.com/neon-ntt/neon-ntt + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 2 +/* simpasm: header-end */ + +// Input: +// - Vectors al, ah of 32-bit entries +// Output: +// - Montgomery reductions of al || ah, stored in x +.macro montgomery_reduce_long x, a + uzp1 t0.8h, \a\()l.8h, \a\()h.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, t0.8h, modulus.8h + uzp2 \x\().8h, \a\()l.8h, \a\()h.8h +.endm + +// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit. 
+ +// Bounds: +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 +.macro pmull d, a, b + smull \d\()0l.4s, \a\()0.4h, \b\()0.4h + smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smull \d\()1l.4s, \a\()0.4h, \b\()1.4h + smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro pmlal d, a, b + smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h + smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h + smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + +.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + + out .req x0 + a0_ptr .req x1 + b0_ptr .req x2 + b0_cache_ptr .req x3 + a1_ptr .req x4 + b1_ptr .req x5 + b1_cache_ptr .req x6 + a2_ptr .req x7 + b2_ptr .req x8 + b2_cache_ptr .req x9 + a3_ptr .req x10 + b3_ptr .req x11 + b3_cache_ptr .req x12 + count .req x13 + wtmp .req w14 + + modulus .req v0 + 
modulus_twisted .req v2 + + aa0 .req v3 + aa1 .req v4 + bb0 .req v5 + bb1 .req v6 + bb1t .req v7 + + res0l .req v8 + res1l .req v9 + res0h .req v10 + res1h .req v11 + + tmp0 .req v12 + tmp1 .req v13 + q_tmp0 .req q12 + q_tmp1 .req q13 + + out0 .req v26 + out1 .req v27 + + t0 .req v28 + + .text + .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2_opt): + push_stack + + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp + + // Computed bases of vector entries + + add a1_ptr, a0_ptr, #(1 * 512) + add b1_ptr, b0_ptr, #(1 * 512) + add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) + + mov count, #(MLKEM_N / 16) + // Instructions: 75 + // Expected cycles: 94 + // Expected IPC: 0.80 + + // Cycle bound: 94.0 + // IPC bound: 0.80 + + // Wall time: 1.49s + // User time: 1.49s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q9, [x4], #32 // *.......................................................................... + ldr q5, [x4, #-16] // ......*.................................................................... + ldr q11, [x5], #32 // .*......................................................................... + uzp1 v23.8H, v9.8H, v5.8H // .........*................................................................. + uzp2 v9.8H, v9.8H, v5.8H // .....................*..................................................... + ldr q5, [x2], #32 // ..*........................................................................ + ldr q7, [x5, #-16] // ..............*............................................................ + ldr q21, [x2, #-16] // ...*....................................................................... + uzp2 v10.8H, v11.8H, v7.8H // .................*......................................................... 
+ uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................ + uzp1 v7.8H, v5.8H, v21.8H // ....*...................................................................... + uzp2 v5.8H, v5.8H, v21.8H // .....*..................................................................... + ldr q21, [x1], #32 // .......*................................................................... + ldr q25, [x1, #-16] // ........*.................................................................. + ld1 {v6.8H}, [x3], #16 // ............................*.............................................. + uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................ + uzp2 v21.8H, v21.8H, v25.8H // ...........*............................................................... + smull v25.4S, v26.4H, v5.4H // ............*.............................................................. + smull2 v5.4S, v26.8H, v5.8H // .............*............................................................. + smull v19.4S, v26.4H, v7.4H // ..........................*................................................ + smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................ + smlal v25.4S, v21.4H, v7.4H // ...............*........................................................... + smlal2 v5.4S, v21.8H, v7.8H // ................*.......................................................... + smlal v19.4S, v21.4H, v6.4H // ...................................*....................................... + smlal2 v26.4S, v21.8H, v6.8H // .................................*......................................... + smlal v25.4S, v23.4H, v10.4H // ...................*....................................................... + smlal2 v5.4S, v23.8H, v10.8H // ....................*...................................................... 
+ smlal v19.4S, v23.4H, v11.4H // ......................................*.................................... + smlal2 v26.4S, v23.8H, v11.8H // ....................................*...................................... + ld1 {v23.8H}, [x6], #16 // ........................*.................................................. + smlal v25.4S, v9.4H, v11.4H // ......................*.................................................... + smlal2 v5.4S, v9.8H, v11.8H // .......................*................................................... + smlal2 v26.4S, v9.8H, v23.8H // .......................................*................................... + smlal v19.4S, v9.4H, v23.4H // .........................................*................................. + ldr q9, [x4], #32 // ...............................*........................................... + uzp1 v11.8H, v25.8H, v5.8H // .........................*................................................. + uzp1 v23.8H, v19.8H, v26.8H // .............................................*............................. + mul v11.8H, v11.8H, v2.8H // ...........................*............................................... + mul v23.8H, v23.8H, v2.8H // ..............................................*............................ + ldr q7, [x5], #32 // ................................*.......................................... + smlal2 v5.4S, v11.8H, v0.8H // .............................*............................................. + smlal v25.4S, v11.4H, v0.4H // ..................................*........................................ + ldr q11, [x2], #32 // .....................................*..................................... + ldr q21, [x2, #-16] // ........................................*.................................. + ldr q6, [x4, #-16] // ...............................................*........................... 
+ uzp1 v17.8H, v11.8H, v21.8H // ...........................................*............................... + ldr q10, [x1], #32 // ................................................*.......................... + ldr q29, [x1, #-16] // .................................................*......................... + uzp2 v11.8H, v11.8H, v21.8H // ............................................*.............................. + uzp1 v13.8H, v9.8H, v6.8H // ...................................................*....................... + uzp1 v3.8H, v10.8H, v29.8H // ....................................................*...................... + uzp2 v10.8H, v10.8H, v29.8H // .....................................................*..................... + smull v12.4S, v3.4H, v11.4H // ......................................................*.................... + smull2 v11.4S, v3.8H, v11.8H // .......................................................*................... + ldr q21, [x5, #-16] // ........................................................*.................. + smlal v12.4S, v10.4H, v17.4H // .........................................................*................. + smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................ + uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*............... + uzp1 v15.8H, v7.8H, v21.8H // ............................................................*.............. + smlal v12.4S, v13.4H, v29.4H // .............................................................*............. + smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............ + uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*........... + smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................ 
+ smlal v12.4S, v28.4H, v15.4H // .................................................................*......... + smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........ + smlal v19.4S, v23.4H, v0.4H // ................................................................*.......... + uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................ + smull v23.4S, v3.4H, v17.4H // ......................................................................*.... + uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*..... + uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*...... + mul v14.8H, v9.8H, v2.8H // .......................................................................*... + ld1 {v22.8H}, [x6], #16 // ...................................................................*....... + zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................* + ld1 {v4.8H}, [x3], #16 // .........................................................................*. + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q18, [x4], #32 // *.......................................................................... + // ldr q30, [x5], #32 // ..*........................................................................ + // ldr q8, [x2], #32 // .....*..................................................................... + // ldr q9, [x2, #-16] // .......*................................................................... + // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................ 
+ // uzp2 v4.8H, v8.8H, v9.8H // ...........*............................................................... + // ldr q19, [x4, #-16] // .*......................................................................... + // ldr q29, [x1], #32 // ............*.............................................................. + // ldr q12, [x1, #-16] // .............*............................................................. + // uzp1 v13.8H, v18.8H, v19.8H // ...*....................................................................... + // uzp1 v3.8H, v29.8H, v12.8H // ...............*........................................................... + // uzp2 v10.8H, v29.8H, v12.8H // ................*.......................................................... + // smull v12.4S, v3.4H, v4.4H // .................*......................................................... + // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................ + // ldr q5, [x5, #-16] // ......*.................................................................... + // smlal v12.4S, v10.4H, v17.4H // .....................*..................................................... + // smlal2 v11.4S, v10.8H, v17.8H // ......................*.................................................... + // uzp2 v14.8H, v30.8H, v5.8H // ........*.................................................................. + // uzp1 v15.8H, v30.8H, v5.8H // .........*................................................................. + // smlal v12.4S, v13.4H, v14.4H // .........................*................................................. + // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................ + // uzp2 v28.8H, v18.8H, v19.8H // ....*...................................................................... + // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................ 
+ // smlal2 v11.4S, v28.8H, v15.8H // ...............................*........................................... + // ld1 {v22.8H}, [x6], #16 // .............................*............................................. + // uzp1 v1.8H, v12.8H, v11.8H // ...................................*....................................... + // smull v23.4S, v3.4H, v17.4H // ...................*....................................................... + // mul v14.8H, v1.8H, v2.8H // .....................................*..................................... + // ld1 {v4.8H}, [x3], #16 // ..............*............................................................ + // smlal2 v11.4S, v14.8H, v0.8H // ........................................*.................................. + // smull2 v20.4S, v3.8H, v17.8H // ....................*...................................................... + // ldr q18, [x4], #32 // ..................................*........................................ + // ldr q30, [x5], #32 // .......................................*................................... + // smlal2 v20.4S, v10.8H, v4.8H // ........................*.................................................. + // smlal v12.4S, v14.4H, v0.4H // .........................................*................................. + // smlal v23.4S, v10.4H, v4.4H // .......................*................................................... + // smlal2 v20.4S, v13.8H, v15.8H // ............................*.............................................. + // ldr q8, [x2], #32 // ..........................................*................................ + // smlal v23.4S, v13.4H, v15.4H // ...........................*............................................... + // smlal2 v20.4S, v28.8H, v22.8H // ................................*.......................................... + // ldr q9, [x2, #-16] // ...........................................*............................... 
+ // smlal v23.4S, v28.4H, v22.4H // .................................*......................................... + // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........ + // uzp1 v17.8H, v8.8H, v9.8H // .............................................*............................. + // uzp2 v4.8H, v8.8H, v9.8H // ................................................*.......................... + // uzp1 v5.8H, v23.8H, v20.8H // ....................................*...................................... + // mul v31.8H, v5.8H, v2.8H // ......................................*.................................... + // ldr q19, [x4, #-16] // ............................................*.............................. + // ldr q29, [x1], #32 // ..............................................*............................ + // ldr q12, [x1, #-16] // ...............................................*........................... + // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............ + // uzp1 v13.8H, v18.8H, v19.8H // .................................................*......................... + // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................ + // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*....................... + // smull v12.4S, v3.4H, v4.4H // ....................................................*...................... + // smull2 v11.4S, v3.8H, v4.8H // .....................................................*..................... + // ldr q5, [x5, #-16] // ......................................................*.................... + // smlal v12.4S, v10.4H, v17.4H // .......................................................*................... + // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*.................. 
+ // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*................. + // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................ + // smlal v12.4S, v13.4H, v14.4H // ...........................................................*............... + // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*.............. + // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*............. + // smlal v23.4S, v31.4H, v0.4H // .................................................................*......... + // smlal v12.4S, v28.4H, v15.4H // ...............................................................*........... + // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*.......... + // ld1 {v22.8H}, [x6], #16 // .......................................................................*... + // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*..... + // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*...... + // smull v23.4S, v3.4H, v17.4H // ...................................................................*....... + // mul v14.8H, v1.8H, v2.8H // ......................................................................*.... + // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. + // ld1 {v4.8H}, [x3], #16 // ..........................................................................* + // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*. 
+ + sub count, count, #2 +polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop: + // Instructions: 48 + // Expected cycles: 58 + // Expected IPC: 0.83 + + // Cycle bound: 58.0 + // IPC bound: 0.83 + + // Wall time: 6.39s + // User time: 6.39s + + // -------------- original position --------------> + // 0 25 + // |------------------------|---------------------- + smull2 v20.4S, v3.8H, v17.8H // ..........*..................................... + ldr q18, [x4], #32 // .................e.............................. + ldr q30, [x5], #32 // .....................e.......................... + smlal2 v20.4S, v10.8H, v4.8H // ............*................................... + smlal v12.4S, v14.4H, v0.4H // .........................................*...... + smlal v23.4S, v10.4H, v4.4H // ...........*.................................... + str q9, [x0, #16] // ...............................................l + smlal2 v20.4S, v13.8H, v15.8H // ...........................*.................... + ldr q8, [x2], #32 // ....e........................................... + smlal v23.4S, v13.4H, v15.4H // ..........................*..................... + smlal2 v20.4S, v28.8H, v22.8H // .............................*.................. + zip1 v26.8H, v19.8H, v27.8H // ............................................l... + ldr q9, [x2, #-16] // .....e.......................................... + smlal v23.4S, v28.4H, v22.4H // ............................*................... + uzp2 v27.8H, v12.8H, v11.8H // ...........................................*.... + uzp1 v17.8H, v8.8H, v9.8H // ......e......................................... + uzp2 v4.8H, v8.8H, v9.8H // .......e........................................ + uzp1 v5.8H, v23.8H, v20.8H // ..................................*............. + str q26, [x0], #32 // ..............................................l. + mul v31.8H, v5.8H, v2.8H // ...................................*............ 
+ ldr q19, [x4, #-16] // ..................e............................. + ldr q29, [x1], #32 // e............................................... + ldr q12, [x1, #-16] // .e.............................................. + smlal2 v20.4S, v31.8H, v0.8H // .....................................*.......... + uzp1 v13.8H, v18.8H, v19.8H // ...................e............................ + uzp1 v3.8H, v29.8H, v12.8H // ..e............................................. + uzp2 v10.8H, v29.8H, v12.8H // ...e............................................ + smull v12.4S, v3.4H, v4.4H // .............e.................................. + smull2 v11.4S, v3.8H, v4.8H // ..............e................................. + ldr q5, [x5, #-16] // ......................e......................... + smlal v12.4S, v10.4H, v17.4H // ...............e................................ + smlal2 v11.4S, v10.8H, v17.8H // ................e............................... + uzp2 v14.8H, v30.8H, v5.8H // ........................e....................... + uzp1 v15.8H, v30.8H, v5.8H // .......................e........................ + smlal v12.4S, v13.4H, v14.4H // ..............................e................. + smlal2 v11.4S, v13.8H, v14.8H // ...............................e................ + uzp2 v28.8H, v18.8H, v19.8H // ....................e........................... + smlal v23.4S, v31.4H, v0.4H // ....................................*........... + smlal v12.4S, v28.4H, v15.4H // ................................e............... + smlal2 v11.4S, v28.8H, v15.8H // .................................e.............. + ld1 {v22.8H}, [x6], #16 // .........................e...................... + uzp2 v19.8H, v23.8H, v20.8H // ......................................*......... + uzp1 v1.8H, v12.8H, v11.8H // .......................................e........ + smull v23.4S, v3.4H, v17.4H // .........e...................................... 
+ mul v14.8H, v1.8H, v2.8H // ........................................e....... + zip2 v9.8H, v19.8H, v27.8H // .............................................*.. + ld1 {v4.8H}, [x3], #16 // ........e....................................... + smlal2 v11.4S, v14.8H, v0.8H // ..........................................e..... + + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'.................. + // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'.................. + // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~.......... + // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~...... + // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~... + // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~.. + // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'.................. 
+ // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'.................. + // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~.................. + // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~............. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~............... + // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'.................. + // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'.................. + // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'.................. + // ldr q12, [x4], #32 // e..............................................'~..............................................'~................. + // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'.................. + // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'.................. + // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'.................. + // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................ 
+ // ldr q13, [x5, #-16] // ............................e..................'............................~..................'.................. + // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'.................. + // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'.................. + // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'.................. + // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~......... + // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~........... + // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~..... + // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........ + // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'.................. + // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'.................. + // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'.................. + // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'.................. + // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~. 
+ // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'.................. + // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'.................. + // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'.................. + // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'.................. + // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'.................. + // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'.................. + // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~.............. + // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'.................. + // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~.... + // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l....... + // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'.................. + // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l + // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............ 
+ + sub count, count, #1 + cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k2_opt_loop + // Instructions: 21 + // Expected cycles: 35 + // Expected IPC: 0.60 + + // Cycle bound: 35.0 + // IPC bound: 0.60 + + // Wall time: 0.08s + // User time: 0.08s + + // ----- original position -----> + // 0 25 + // |------------------------|---- + smull2 v5.4S, v3.8H, v17.8H // *............................. + smlal v12.4S, v14.4H, v0.4H // ..*........................... + smlal v23.4S, v10.4H, v4.4H // ...*.......................... + str q9, [x0, #16] // ....*......................... + smlal2 v5.4S, v10.8H, v4.8H // .*............................ + uzp2 v11.8H, v12.8H, v11.8H // ..........*................... + zip1 v9.8H, v19.8H, v27.8H // ........*..................... + smlal v23.4S, v13.4H, v15.4H // ......*....................... + smlal2 v5.4S, v13.8H, v15.8H // .....*........................ + str q9, [x0], #32 // ............*................. + smlal v23.4S, v28.4H, v22.4H // .........*.................... + smlal2 v5.4S, v28.8H, v22.8H // .......*...................... + uzp1 v9.8H, v23.8H, v5.8H // ...........*.................. + mul v9.8H, v9.8H, v2.8H // .............*................ + smlal2 v5.4S, v9.8H, v0.8H // ..............*............... + smlal v23.4S, v9.4H, v0.4H // ...............*.............. + uzp2 v9.8H, v23.8H, v5.8H // ................*............. + zip2 v5.8H, v9.8H, v11.8H // .................*............ + zip1 v9.8H, v9.8H, v11.8H // ...................*.......... + str q5, [x0, #16] // ..................*........... + str q9, [x0], #32 // ....................*......... + + // -------- new position --------> + // 0 25 + // |------------------------|----- + // smull2 v20.4S, v3.8H, v17.8H // *.............................. + // smlal2 v20.4S, v10.8H, v4.8H // ....*.......................... + // smlal v12.4S, v14.4H, v0.4H // .*............................. + // smlal v23.4S, v10.4H, v4.4H // ..*............................ 
+ // str q9, [x0, #16] // ...*........................... + // smlal2 v20.4S, v13.8H, v15.8H // ........*...................... + // smlal v23.4S, v13.4H, v15.4H // .......*....................... + // smlal2 v20.4S, v28.8H, v22.8H // ...........*................... + // zip1 v26.8H, v19.8H, v27.8H // ......*........................ + // smlal v23.4S, v28.4H, v22.4H // ..........*.................... + // uzp2 v27.8H, v12.8H, v11.8H // .....*......................... + // uzp1 v5.8H, v23.8H, v20.8H // ............*.................. + // str q26, [x0], #32 // .........*..................... + // mul v31.8H, v5.8H, v2.8H // .............*................. + // smlal2 v20.4S, v31.8H, v0.8H // ..............*................ + // smlal v23.4S, v31.4H, v0.4H // ...............*............... + // uzp2 v19.8H, v23.8H, v20.8H // ................*.............. + // zip2 v9.8H, v19.8H, v27.8H // .................*............. + // str q9, [x0, #16] // ...................*........... + // zip1 v26.8H, v19.8H, v27.8H // ..................*............ + // str q26, [x0], #32 // ....................*.......... 
+
+
+        pop_stack
+        ret
+
+/****************** REGISTER DEALLOCATIONS *******************/
+    .unreq out
+    .unreq a0_ptr
+    .unreq b0_ptr
+    .unreq b0_cache_ptr
+    .unreq a1_ptr
+    .unreq b1_ptr
+    .unreq b1_cache_ptr
+    .unreq a2_ptr
+    .unreq b2_ptr
+    .unreq b2_cache_ptr
+    .unreq a3_ptr
+    .unreq b3_ptr
+    .unreq b3_cache_ptr
+    .unreq count
+    .unreq modulus
+    .unreq modulus_twisted
+    .unreq wtmp
+    .unreq aa0
+    .unreq aa1
+    .unreq bb0
+    .unreq bb1
+    .unreq bb1t
+    .unreq res0l
+    .unreq res1l
+    .unreq res0h
+    .unreq res1h
+    .unreq tmp0
+    .unreq tmp1
+    .unreq q_tmp0
+    .unreq q_tmp1
+    .unreq out0
+    .unreq out1
+    .unreq t0
+
+/* simpasm: footer-start */
+#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */
diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S
new file mode 100644
index 000000000..1c30ed6aa
--- /dev/null
+++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3_opt.S
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2024 The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// AArch64 re-implementation of the asymmetric base multiplication from:
+
+// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
+// https://eprint.iacr.org/2021/986
+// https://github.com/neon-ntt/neon-ntt
+
+#include "../../../common.h"
+#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 3
+/* simpasm: header-end */
+
+// Input:
+// - Vectors al, ah of 32-bit entries
+//   (\a is a name prefix: the macro operates on the registers \a-l and \a-h)
+// Output:
+// - Montgomery reductions of al || ah, stored in x as eight 16-bit lanes
+// Clobbers:
+// - al, ah (the multiples of the modulus are accumulated into them in place)
+// - t0
+.macro montgomery_reduce_long x, a
+        // t0 = low 16 bits of every 32-bit entry of al || ah (even halfwords)
+        uzp1   t0.8h, \a\()l.8h, \a\()h.8h
+        // t0 = t0 * modulus_twisted, keeping only the low 16 bits per lane
+        mul    t0.8h, t0.8h, modulus_twisted.8h
+        // al += t0[0..3] * modulus, ah += t0[4..7] * modulus (widening to 32 bit)
+        smlal  \a\()l.4s, t0.4h, modulus.4h
+        smlal2 \a\()h.4s, t0.8h, modulus.8h
+        // x = high 16 bits of every 32-bit entry of al || ah (odd halfwords)
+        uzp2   \x\().8h, \a\()l.8h, \a\()h.8h
+.endm
+
+// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit.
+//
+// \d, \a, \b are name prefixes: the inputs are \a-0, \a-1, \b-0, \b-1, \b-1t
+// (16-bit lanes); the outputs are \d-0l/\d-0h (first product, low/high four
+// lanes) and \d-1l/\d-1h (second product, low/high four lanes).
+// Any previous contents of the \d registers are overwritten.
+
+ // Bounds:
+ // - Assume |a| < 4096,
+ // - Result: < 2*4096*2^15 = 2^28
+.macro pmull d, a, b
+        // d0 = a0*b0 + a1*b1t
+        smull  \d\()0l.4s, \a\()0.4h, \b\()0.4h
+        smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
+        smlal  \d\()0l.4s, \a\()1.4h, \b\()1t.4h
+        smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
+
+        // d1 = a0*b1 + a1*b0
+        smull  \d\()1l.4s, \a\()0.4h, \b\()1.4h
+        smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
+        smlal  \d\()1l.4s, \a\()1.4h, \b\()0.4h
+        smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
+.endm
+
+// Same two products as pmull, but every instruction is a multiply-accumulate,
+// so the results are added onto the existing contents of the \d registers:
+//   d0 += a0*b0 + a1*b1t
+//   d1 += a0*b1 + a1*b0
+.macro pmlal d, a, b
+        // d0 += a0*b0 + a1*b1t
+        smlal  \d\()0l.4s, \a\()0.4h, \b\()0.4h
+        smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h
+        smlal  \d\()0l.4s, \a\()1.4h, \b\()1t.4h
+        smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h
+
+        // d1 += a0*b1 + a1*b0
+        smlal  \d\()1l.4s, \a\()0.4h, \b\()1.4h
+        smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h
+        smlal  \d\()1l.4s, \a\()1.4h, \b\()0.4h
+        smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h
+.endm
+
+// De-interleaving load: reads 32 bytes (16 halfwords) from [ptr], advances
+// ptr by 32, and splits them into \a-0 (even-indexed halfwords) and \a-1
+// (odd-indexed halfwords).
+// Clobbers tmp0/tmp1 (q_tmp0/q_tmp1 are the q-register names of the same
+// scratch registers, see the .req aliases further down).
+.macro ld2_wrap a, ptr
+        ldr  q_tmp0, [\ptr\()], #32
+        // ptr has already been advanced, so #-16 addresses the second 16 bytes
+        ldr  q_tmp1, [\ptr\(), #-16]
+        uzp1 \a\()0.8h, tmp0.8h, tmp1.8h
+        uzp2 \a\()1.8h, tmp0.8h, tmp1.8h
+.endm
+
+// Interleaving store, the inverse of ld2_wrap: zips \a-0 and \a-1 into 16
+// halfwords, writes them as 32 bytes to [ptr], and advances ptr by 32.
+// Clobbers tmp0/tmp1.
+.macro st2_wrap a, ptr
+        zip1 tmp0.8h, \a\()0.8h, \a\()1.8h
+        zip2 tmp1.8h, \a\()0.8h, \a\()1.8h
+        str  q_tmp0, [\ptr\()], #32
+        // ptr has already been advanced, so #-16 addresses the second 16 bytes
+        str  q_tmp1, [\ptr\(), #-16]
+.endm
+
+// Loads one block of inputs for pmull/pmlal:
+// - \a-0, \a-1 from [a_ptr]  (de-interleaved, a_ptr += 32)
+// - \b-0, \b-1 from [b_ptr]  (de-interleaved, b_ptr += 32)
+// - \b-1t      from [b_cache_ptr] (8 halfwords, b_cache_ptr += 16)
+.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr
+        ld2_wrap \a\(), \a_ptr
+        ld2_wrap \b\(), \b_ptr
+        ld1 {\b\()1t.8h}, [\b_cache_ptr], #16
+.endm
+
+// Reserves 64 bytes of stack and spills d8-d15 there. These are the low
+// halves of v8-v15, which AAPCS64 requires a callee to preserve.
+.macro save_vregs
+        sub sp, sp, #(16*4)
+        stp  d8,  d9, [sp, #16*0]
+        stp d10, d11, [sp, #16*1]
+        stp d12, d13, [sp, #16*2]
+        stp d14, d15, [sp, #16*3]
+.endm
+
+// Reloads d8-d15 from the area written by save_vregs and releases the
+// 64 bytes of stack again.
+.macro restore_vregs
+        ldp  d8,  d9, [sp, #16*0]
+        ldp d10, d11, [sp, #16*1]
+        ldp d12, d13, [sp, #16*2]
+        ldp d14, d15, [sp, #16*3]
+        add sp, sp, #(16*4)
+.endm
+
+// Function prologue helper: currently only saves the vector registers.
+.macro push_stack
+        save_vregs
+.endm
+
+// Function epilogue helper: currently only restores the vector registers.
+.macro pop_stack
+        restore_vregs
+.endm
+
+    // Symbolic names for the general-purpose registers x0-x12 used as
+    // output / input / cache pointers.
+    out           .req x0
+    a0_ptr        .req x1
+    b0_ptr        .req x2
+    b0_cache_ptr  .req x3
+    a1_ptr        .req x4
+    b1_ptr        .req x5
+    b1_cache_ptr  .req x6
+    a2_ptr        .req x7
+    b2_ptr        .req x8
+    b2_cache_ptr  .req x9
+    a3_ptr        .req x10
+    b3_ptr        .req x11
+    b3_cache_ptr  .req x12
+    count
.req x13 + wtmp .req w14 + + modulus .req v0 + modulus_twisted .req v2 + + aa0 .req v3 + aa1 .req v4 + bb0 .req v5 + bb1 .req v6 + bb1t .req v7 + + res0l .req v8 + res1l .req v9 + res0h .req v10 + res1h .req v11 + + tmp0 .req v12 + tmp1 .req v13 + q_tmp0 .req q12 + q_tmp1 .req q13 + + out0 .req v26 + out1 .req v27 + + t0 .req v28 + + .text + .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3_opt): + push_stack + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp + + // Computed bases of vector entries + + add a1_ptr, a0_ptr, #(1 * 512) + add b1_ptr, b0_ptr, #(1 * 512) + add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) + add a2_ptr, a0_ptr, #(2 * 512) + add b2_ptr, b0_ptr, #(2 * 512) + add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) + + mov count, #(MLKEM_N / 16) + // Instructions: 75 + // Expected cycles: 103 + // Expected IPC: 0.73 + + // Cycle bound: 103.0 + // IPC bound: 0.73 + + // Wall time: 0.94s + // User time: 0.94s + + // --------------------------- original position ----------------------------> + // 0 25 50 + // |------------------------|------------------------| + ldr q7, [x2, #16] // *.......................................................................... + ldr q20, [x2], #32 // ..*........................................................................ + ldr q15, [x1, #16] // .*......................................................................... + uzp1 v8.8H, v20.8H, v7.8H // ...............*........................................................... + uzp2 v7.8H, v20.8H, v7.8H // ................*.......................................................... + ld1 {v20.8H}, [x3], #16 // ...*....................................................................... + ldr q30, [x1], #32 // ..............*............................................................ 
+ ldr q11, [x4], #32 // ....*...................................................................... + uzp1 v16.8H, v30.8H, v15.8H // .................*......................................................... + uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................ + smull v30.4S, v16.4H, v7.4H // ...................*....................................................... + smull2 v7.4S, v16.8H, v7.8H // ....................*...................................................... + smull v9.4S, v16.4H, v8.4H // .....................*..................................................... + smull2 v16.4S, v16.8H, v8.8H // ......................*.................................................... + smlal v30.4S, v15.4H, v8.4H // .......................*................................................... + smlal2 v7.4S, v15.8H, v8.8H // ........................*.................................................. + smlal v9.4S, v15.4H, v20.4H // .........................*................................................. + smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................ + ldr q20, [x4, #-16] // .....*..................................................................... + ldr q15, [x5], #32 // ......*.................................................................... + uzp1 v8.8H, v11.8H, v20.8H // ...........................*............................................... + uzp2 v20.8H, v11.8H, v20.8H // ............................*.............................................. + ldr q11, [x5, #-16] // .......*................................................................... + ld1 {v27.8H}, [x6], #16 // ........*.................................................................. + uzp1 v10.8H, v15.8H, v11.8H // .............................*............................................. 
+ uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................ + smlal v9.4S, v8.4H, v10.4H // ...............................*........................................... + smlal2 v16.4S, v8.8H, v10.8H // ................................*.......................................... + smlal v30.4S, v8.4H, v15.4H // .................................*......................................... + smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................ + smlal v9.4S, v20.4H, v27.4H // ...................................*....................................... + smlal2 v16.4S, v20.8H, v27.8H // ....................................*...................................... + smlal v30.4S, v20.4H, v10.4H // .....................................*..................................... + smlal2 v7.4S, v20.8H, v10.8H // ......................................*.................................... + ldr q20, [x7], #32 // .........*................................................................. + ldr q15, [x7, #-16] // ..........*................................................................ + ldr q8, [x8], #32 // ...........*............................................................... + uzp1 v11.8H, v20.8H, v15.8H // .......................................*................................... + uzp2 v20.8H, v20.8H, v15.8H // ........................................*.................................. + ldr q15, [x8, #-16] // ............*.............................................................. + ld1 {v27.8H}, [x9], #16 // .............*............................................................. + uzp1 v10.8H, v8.8H, v15.8H // .........................................*................................. + uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................ 
+ smlal v9.4S, v11.4H, v10.4H // ...........................................*............................... + smlal2 v16.4S, v11.8H, v10.8H // ............................................*.............................. + smlal v30.4S, v11.4H, v15.4H // .............................................*............................. + smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................ + smlal v9.4S, v20.4H, v27.4H // ...............................................*........................... + smlal2 v16.4S, v20.8H, v27.8H // ................................................*.......................... + smlal v30.4S, v20.4H, v10.4H // .................................................*......................... + smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................ + ldr q15, [x2], #32 // ...............................................................*........... + uzp1 v20.8H, v9.8H, v16.8H // ....................................................*...................... + uzp1 v8.8H, v30.8H, v7.8H // .....................................................*..................... + mul v20.8H, v20.8H, v2.8H // ......................................................*.................... + mul v8.8H, v8.8H, v2.8H // .......................................................*................... + ldr q21, [x4], #32 // .................................................................*......... + smlal v9.4S, v20.4H, v0.4H // ........................................................*.................. + smlal2 v16.4S, v20.8H, v0.8H // .........................................................*................. + smlal v30.4S, v8.4H, v0.4H // ..........................................................*................ + smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*............... 
+ ldr q6, [x4, #-16] // ..................................................................*........ + uzp2 v27.8H, v9.8H, v16.8H // ............................................................*.............. + uzp2 v10.8H, v30.8H, v7.8H // .............................................................*............. + ldr q16, [x2, #-16] // ...................................................*....................... + ldr q30, [x1, #16] // ..............................................................*............ + ld1 {v9.8H}, [x3], #16 // ................................................................*.......... + ldr q1, [x5], #32 // ...................................................................*....... + ldr q12, [x5, #-16] // ....................................................................*...... + ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + ldr q19, [x7], #32 // ......................................................................*.... + ldr q31, [x7, #-16] // .......................................................................*... + ldr q17, [x8], #32 // ........................................................................*.. + ldr q18, [x8, #-16] // .........................................................................*. + ld1 {v25.8H}, [x9], #16 // ..........................................................................* + + // ------------------------------ new position ------------------------------> + // 0 25 50 + // |------------------------|------------------------|------------------------ + // ldr q16, [x2, #16] // *.......................................................................... + // ldr q30, [x1, #16] // ..*........................................................................ + // ldr q15, [x2], #32 // .*......................................................................... 
+ // ld1 {v9.8H}, [x3], #16 // .....*..................................................................... + // ldr q21, [x4], #32 // .......*................................................................... + // ldr q6, [x4, #-16] // ..................*........................................................ + // ldr q1, [x5], #32 // ...................*....................................................... + // ldr q12, [x5, #-16] // ......................*.................................................... + // ld1 {v24.8H}, [x6], #16 // .......................*................................................... + // ldr q19, [x7], #32 // ..................................*........................................ + // ldr q31, [x7, #-16] // ...................................*....................................... + // ldr q17, [x8], #32 // ....................................*...................................... + // ldr q18, [x8, #-16] // .......................................*................................... + // ld1 {v25.8H}, [x9], #16 // ........................................*.................................. + // ldr q20, [x1], #32 // ......*.................................................................... + // uzp1 v7.8H, v15.8H, v16.8H // ...*....................................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ....*...................................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ........*.................................................................. + // uzp2 v20.8H, v20.8H, v30.8H // .........*................................................................. + // smull v30.4S, v8.4H, v15.4H // ..........*................................................................ + // smull2 v15.4S, v8.8H, v15.8H // ...........*............................................................... 
+ // smull v11.4S, v8.4H, v7.4H // ............*.............................................................. + // smull2 v8.4S, v8.8H, v7.8H // .............*............................................................. + // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................ + // smlal2 v15.4S, v20.8H, v7.8H // ...............*........................................................... + // smlal v11.4S, v20.4H, v9.4H // ................*.......................................................... + // smlal2 v8.4S, v20.8H, v9.8H // .................*......................................................... + // uzp1 v7.8H, v21.8H, v6.8H // ....................*...................................................... + // uzp2 v20.8H, v21.8H, v6.8H // .....................*..................................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........................*.................................................. + // uzp2 v9.8H, v1.8H, v12.8H // .........................*................................................. + // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................ + // smlal2 v8.4S, v7.8H, v16.8H // ...........................*............................................... + // smlal v30.4S, v7.4H, v9.4H // ............................*.............................................. + // smlal2 v15.4S, v7.8H, v9.8H // .............................*............................................. + // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................ + // smlal2 v8.4S, v20.8H, v24.8H // ...............................*........................................... + // smlal v30.4S, v20.4H, v16.4H // ................................*.......................................... 
+ // smlal2 v15.4S, v20.8H, v16.8H // .................................*......................................... + // uzp1 v7.8H, v19.8H, v31.8H // .....................................*..................................... + // uzp2 v20.8H, v19.8H, v31.8H // ......................................*.................................... + // uzp1 v16.8H, v17.8H, v18.8H // .........................................*................................. + // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................ + // smlal v11.4S, v7.4H, v16.4H // ...........................................*............................... + // smlal2 v8.4S, v7.8H, v16.8H // ............................................*.............................. + // smlal v30.4S, v7.4H, v9.4H // .............................................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................ + // smlal v11.4S, v20.4H, v25.4H // ...............................................*........................... + // smlal2 v8.4S, v20.8H, v25.8H // ................................................*.......................... + // smlal v30.4S, v20.4H, v16.4H // .................................................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................ + // ldr q16, [x2, #16] // ................................................................*.......... + // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*...................... + // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*..................... + // mul v7.8H, v7.8H, v2.8H // ......................................................*.................... 
+ // mul v20.8H, v20.8H, v2.8H // .......................................................*................... + // smlal v11.4S, v7.4H, v0.4H // .........................................................*................. + // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................ + // smlal v30.4S, v20.4H, v0.4H // ...........................................................*............... + // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*.............. + // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............ + // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*........... + // ldr q30, [x1, #16] // .................................................................*......... + // ldr q15, [x2], #32 // ...................................................*....................... + // ld1 {v9.8H}, [x3], #16 // ..................................................................*........ + // ldr q21, [x4], #32 // ........................................................*.................. + // ldr q6, [x4, #-16] // .............................................................*............. + // ldr q1, [x5], #32 // ...................................................................*....... + // ldr q12, [x5, #-16] // ....................................................................*...... + // ld1 {v24.8H}, [x6], #16 // .....................................................................*..... + // ldr q19, [x7], #32 // ......................................................................*.... + // ldr q31, [x7, #-16] // .......................................................................*... + // ldr q17, [x8], #32 // ........................................................................*.. 
+ // ldr q18, [x8, #-16] // .........................................................................*. + // ld1 {v25.8H}, [x9], #16 // ..........................................................................* + + sub count, count, #2 +polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop: + // Instructions: 65 + // Expected cycles: 80 + // Expected IPC: 0.81 + + // Cycle bound: 80.0 + // IPC bound: 0.81 + + // Wall time: 11.64s + // User time: 11.64s + + // ---------------------- original position -----------------------> + // 0 25 50 + // |------------------------|------------------------|-------------- + ldr q20, [x1], #32 // *................................................................ + uzp1 v7.8H, v15.8H, v16.8H // ......*.......................................................... + uzp2 v15.8H, v15.8H, v16.8H // .......*......................................................... + uzp1 v8.8H, v20.8H, v30.8H // ..*.............................................................. + uzp2 v20.8H, v20.8H, v30.8H // ...*............................................................. + smull v30.4S, v8.4H, v15.4H // .............*................................................... + smull2 v15.4S, v8.8H, v15.8H // ..............*.................................................. + smull v11.4S, v8.4H, v7.4H // .........*....................................................... + smull2 v8.4S, v8.8H, v7.8H // ..........*...................................................... + smlal v30.4S, v20.4H, v7.4H // ...............*................................................. + smlal2 v15.4S, v20.8H, v7.8H // ................*................................................ + smlal v11.4S, v20.4H, v9.4H // ...........*..................................................... + smlal2 v8.4S, v20.8H, v9.8H // ............*.................................................... + uzp1 v7.8H, v21.8H, v6.8H // ...................*............................................. 
+ uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................ + uzp1 v16.8H, v1.8H, v12.8H // .......................*......................................... + uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................ + smlal v11.4S, v7.4H, v16.4H // ..........................*...................................... + smlal2 v8.4S, v7.8H, v16.8H // ...........................*..................................... + smlal v30.4S, v7.4H, v9.4H // ..............................*.................................. + smlal2 v15.4S, v7.8H, v9.8H // ...............................*................................. + smlal v11.4S, v20.4H, v24.4H // ............................*.................................... + smlal2 v8.4S, v20.8H, v24.8H // .............................*................................... + smlal v30.4S, v20.4H, v16.4H // ................................*................................ + smlal2 v15.4S, v20.8H, v16.8H // .................................*............................... + uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................ + uzp2 v20.8H, v19.8H, v31.8H // .....................................*........................... + uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................ + uzp2 v9.8H, v17.8H, v18.8H // .........................................*....................... + smlal v11.4S, v7.4H, v16.4H // ...........................................*..................... + smlal2 v8.4S, v7.8H, v16.8H // ............................................*.................... + smlal v30.4S, v7.4H, v9.4H // ...............................................*................. + smlal2 v15.4S, v7.8H, v9.8H // ................................................*................ + smlal v11.4S, v20.4H, v25.4H // .............................................*................... 
+ smlal2 v8.4S, v20.8H, v25.8H // ..............................................*.................. + smlal v30.4S, v20.4H, v16.4H // .................................................*............... + smlal2 v15.4S, v20.8H, v16.8H // ..................................................*.............. + ldr q16, [x2, #16] // .....e........................................................... + uzp1 v7.8H, v11.8H, v8.8H // ...................................................*............. + uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........ + mul v7.8H, v7.8H, v2.8H // ....................................................*............ + mul v20.8H, v20.8H, v2.8H // .........................................................*....... + zip2 v9.8H, v27.8H, v10.8H // ..............................................................l.. + zip1 v27.8H, v27.8H, v10.8H // .............................................................l... + smlal v11.4S, v7.4H, v0.4H // .....................................................*........... + smlal2 v8.4S, v7.8H, v0.8H // ......................................................*.......... + smlal v30.4S, v20.4H, v0.4H // ..........................................................*...... + smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*..... + str q27, [x0], #32 // ...............................................................l. + uzp2 v27.8H, v11.8H, v8.8H // .......................................................*......... + str q9, [x0, #-16] // ................................................................l + uzp2 v10.8H, v30.8H, v15.8H // ............................................................*.... + ldr q30, [x1, #16] // .e............................................................... + ldr q15, [x2], #32 // ....e............................................................ 
+ ld1 {v9.8H}, [x3], #16 // ........e........................................................ + ldr q21, [x4], #32 // .................e............................................... + ldr q6, [x4, #-16] // ..................e.............................................. + ldr q1, [x5], #32 // .....................e........................................... + ldr q12, [x5, #-16] // ......................e.......................................... + ld1 {v24.8H}, [x6], #16 // .........................e....................................... + ldr q19, [x7], #32 // ..................................e.............................. + ldr q31, [x7, #-16] // ...................................e............................. + ldr q17, [x8], #32 // ......................................e.......................... + ldr q18, [x8, #-16] // .......................................e......................... + ld1 {v25.8H}, [x9], #16 // ..........................................e...................... + + // ---------------------------------------------------------------- new position -----------------------------------------------------------------> + // 0 25 50 75 100 125 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ + // ldr q12, [x1], #32 // ............................*................................................................~.................................................. + // ldr q13, [x1, #-16] // ...............e............'...................................................~............'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~............................................... 
+ // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~.............................................. + // ldr q12, [x2], #32 // ................e...........'....................................................~...........'.................................................. + // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~............. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~................................................. + // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................ + // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'.................................................. + // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~........................................... + // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~.......................................... + // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~....................................... + // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~...................................... + // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~............................................. 
+ // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................ + // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~......................................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................ + // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'.................................................. + // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'.................................................. + // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~..................................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~.................................... + // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'.................................................. + // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'.................................................. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~................................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~.................................. 
+ // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................ + // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~............................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................ + // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~............................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~.............................. + // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~........................... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~.......................... + // ldr q12, [x7], #32 // .......................e....'...........................................................~....'.................................................. + // ldr q13, [x7, #-16] // ........................e...'............................................................~...'.................................................. 
+ // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~......................... + // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................ + // ldr q12, [x8], #32 // .........................e..'.............................................................~..'.................................................. + // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'.................................................. + // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~....................... + // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~...................... + // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'.................................................. + // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~..................... + // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~.................... + // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~................. + // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................ 
+ // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~................... + // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~.................. + // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~............... + // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~.............. + // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............ + // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~.......... + // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~...... + // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~..... + // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~. + // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~........... + // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~......... 
+ // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~.... + // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~... + // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'.................................................. + // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l....... + // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........ + // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l.. + // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l + + sub count, count, #1 + cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k3_opt_loop + // Instructions: 55 + // Expected cycles: 61 + // Expected IPC: 0.90 + + // Cycle bound: 61.0 + // IPC bound: 0.90 + + // Wall time: 8.41s + // User time: 8.41s + + // ----------------- original position ------------------> + // 0 25 50 + // |------------------------|------------------------|---- + ldr q7, [x1], #32 // *...................................................... + uzp1 v20.8H, v15.8H, v16.8H // .*..................................................... + uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + uzp1 v23.8H, v7.8H, v30.8H // ...*................................................... 
+ uzp2 v11.8H, v7.8H, v30.8H // ....*.................................................. + smull2 v8.4S, v23.8H, v20.8H // ........*.............................................. + smull v5.4S, v23.4H, v20.4H // .......*............................................... + smull2 v30.4S, v23.8H, v15.8H // ......*................................................ + uzp1 v28.8H, v1.8H, v12.8H // ...............*....................................... + smlal2 v8.4S, v11.8H, v9.8H // ............*.......................................... + smlal v5.4S, v11.4H, v9.4H // ...........*........................................... + uzp1 v3.8H, v21.8H, v6.8H // .............*......................................... + smull v16.4S, v23.4H, v15.4H // .....*................................................. + smlal2 v8.4S, v3.8H, v28.8H // ..................*.................................... + smlal v5.4S, v3.4H, v28.4H // .................*..................................... + uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................ + uzp1 v7.8H, v17.8H, v18.8H // ...........................*........................... + smlal2 v8.4S, v29.8H, v24.8H // ......................*................................ + uzp1 v14.8H, v19.8H, v31.8H // .........................*............................. + smlal v16.4S, v11.4H, v20.4H // .........*............................................. + smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................ + smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................ + uzp2 v20.8H, v1.8H, v12.8H // ................*...................................... + uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................ + smlal2 v30.4S, v3.8H, v20.8H // ....................*.................................. + smlal v16.4S, v3.4H, v20.4H // ...................*................................... 
+ smlal v5.4S, v29.4H, v24.4H // .....................*................................. + uzp2 v9.8H, v17.8H, v18.8H // ............................*.......................... + smlal2 v30.4S, v29.8H, v28.8H // ........................*.............................. + smlal v16.4S, v29.4H, v28.4H // .......................*............................... + smlal v5.4S, v14.4H, v7.4H // .............................*......................... + smlal2 v8.4S, v21.8H, v25.8H // ..................................*.................... + smlal2 v30.4S, v14.8H, v9.8H // ................................*...................... + smlal v16.4S, v14.4H, v9.4H // ...............................*....................... + smlal v5.4S, v21.4H, v25.4H // .................................*..................... + zip1 v20.8H, v27.8H, v10.8H // ..........................................*............ + smlal2 v30.4S, v21.8H, v7.8H // ....................................*.................. + smlal v16.4S, v21.4H, v7.4H // ...................................*................... + uzp1 v7.8H, v5.8H, v8.8H // .....................................*................. + str q20, [x0], #32 // ...............................................*....... + mul v15.8H, v7.8H, v2.8H // .......................................*............... + uzp1 v7.8H, v16.8H, v30.8H // ......................................*................ + zip2 v31.8H, v27.8H, v10.8H // .........................................*............. + mul v20.8H, v7.8H, v2.8H // ........................................*.............. + smlal v5.4S, v15.4H, v0.4H // ...........................................*........... + smlal2 v8.4S, v15.8H, v0.8H // ............................................*.......... + str q31, [x0, #-16] // .................................................*..... + smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........ 
+ smlal v16.4S, v20.4H, v0.4H // .............................................*......... + uzp2 v15.8H, v5.8H, v8.8H // ................................................*...... + uzp2 v20.8H, v16.8H, v30.8H // ..................................................*.... + zip1 v7.8H, v15.8H, v20.8H // ....................................................*.. + zip2 v20.8H, v15.8H, v20.8H // ...................................................*... + str q7, [x0], #32 // .....................................................*. + str q20, [x0, #-16] // ......................................................* + + // -------------------- new position --------------------> + // 0 25 50 + // |------------------------|------------------------|---- + // ldr q20, [x1], #32 // *...................................................... + // uzp1 v7.8H, v15.8H, v16.8H // .*..................................................... + // uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... + // uzp1 v8.8H, v20.8H, v30.8H // ...*................................................... + // uzp2 v20.8H, v20.8H, v30.8H // ....*.................................................. + // smull v30.4S, v8.4H, v15.4H // ............*.......................................... + // smull2 v15.4S, v8.8H, v15.8H // .......*............................................... + // smull v11.4S, v8.4H, v7.4H // ......*................................................ + // smull2 v8.4S, v8.8H, v7.8H // .....*................................................. + // smlal v30.4S, v20.4H, v7.4H // ...................*................................... + // smlal2 v15.4S, v20.8H, v7.8H // ....................*.................................. + // smlal v11.4S, v20.4H, v9.4H // ..........*............................................ + // smlal2 v8.4S, v20.8H, v9.8H // .........*............................................. 
+ // uzp1 v7.8H, v21.8H, v6.8H // ...........*........................................... + // uzp2 v20.8H, v21.8H, v6.8H // ...............*....................................... + // uzp1 v16.8H, v1.8H, v12.8H // ........*.............................................. + // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................ + // smlal v11.4S, v7.4H, v16.4H // ..............*........................................ + // smlal2 v8.4S, v7.8H, v16.8H // .............*......................................... + // smlal v30.4S, v7.4H, v9.4H // .........................*............................. + // smlal2 v15.4S, v7.8H, v9.8H // ........................*.............................. + // smlal v11.4S, v20.4H, v24.4H // ..........................*............................ + // smlal2 v8.4S, v20.8H, v24.8H // .................*..................................... + // smlal v30.4S, v20.4H, v16.4H // .............................*......................... + // smlal2 v15.4S, v20.8H, v16.8H // ............................*.......................... + // uzp1 v7.8H, v19.8H, v31.8H // ..................*.................................... + // uzp2 v20.8H, v19.8H, v31.8H // .......................*............................... + // uzp1 v16.8H, v17.8H, v18.8H // ................*...................................... + // uzp2 v9.8H, v17.8H, v18.8H // ...........................*........................... + // smlal v11.4S, v7.4H, v16.4H // ..............................*........................ + // smlal2 v8.4S, v7.8H, v16.8H // .....................*................................. + // smlal v30.4S, v7.4H, v9.4H // .................................*..................... + // smlal2 v15.4S, v7.8H, v9.8H // ................................*...................... + // smlal v11.4S, v20.4H, v25.4H // ..................................*.................... 
+ // smlal2 v8.4S, v20.8H, v25.8H // ...............................*....................... + // smlal v30.4S, v20.4H, v16.4H // .....................................*................. + // smlal2 v15.4S, v20.8H, v16.8H // ....................................*.................. + // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................ + // uzp1 v20.8H, v30.8H, v15.8H // .........................................*............. + // mul v7.8H, v7.8H, v2.8H // ........................................*.............. + // mul v20.8H, v20.8H, v2.8H // ...........................................*........... + // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............ + // zip1 v27.8H, v27.8H, v10.8H // ...................................*................... + // smlal v11.4S, v7.4H, v0.4H // ............................................*.......... + // smlal2 v8.4S, v7.8H, v0.8H // .............................................*......... + // smlal v30.4S, v20.4H, v0.4H // ................................................*...... + // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*....... + // str q27, [x0], #32 // .......................................*............... + // uzp2 v27.8H, v11.8H, v8.8H // .................................................*..... + // str q9, [x0, #-16] // ..............................................*........ + // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*.... + // zip2 v9.8H, v27.8H, v10.8H // ....................................................*.. + // zip1 v27.8H, v27.8H, v10.8H // ...................................................*... + // str q27, [x0], #32 // .....................................................*. 
+ // str q9, [x0, #-16] // ......................................................* + + + pop_stack + ret + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq out + .unreq a0_ptr + .unreq b0_ptr + .unreq b0_cache_ptr + .unreq a1_ptr + .unreq b1_ptr + .unreq b1_cache_ptr + .unreq a2_ptr + .unreq b2_ptr + .unreq b2_cache_ptr + .unreq a3_ptr + .unreq b3_ptr + .unreq b3_cache_ptr + .unreq count + .unreq modulus + .unreq modulus_twisted + .unreq wtmp + .unreq aa0 + .unreq aa1 + .unreq bb0 + .unreq bb1 + .unreq bb1t + .unreq res0l + .unreq res1l + .unreq res0h + .unreq res1h + .unreq tmp0 + .unreq tmp1 + .unreq q_tmp0 + .unreq q_tmp1 + .unreq out0 + .unreq out1 + .unreq t0 + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 3 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S new file mode 100644 index 000000000..c3d70ed42 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4_opt.S @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// AArch64 re-implementation of the asymmetric base multiplication from: + +// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 +// https://eprint.iacr.org/2021/986 +// https://github.com/neon-ntt/neon-ntt + +#include "../../../common.h" +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) && MLKEM_K == 4 +/* simpasm: header-end */ + +// Input: +// - Vectors al, ah of 32-bit entries +// Output: +// - Montgomery reductions of al || ah, stored in x (al, ah are clobbered) +.macro montgomery_reduce_long x, a + uzp1 t0.8h, \a\()l.8h, \a\()h.8h + mul t0.8h, t0.8h, modulus_twisted.8h + smlal \a\()l.4s, t0.4h, modulus.4h + smlal2 \a\()h.4s, 
t0.8h, modulus.8h + uzp2 \x\().8h, \a\()l.8h, \a\()h.8h +.endm + +// Computes products (a0*b0 + a1*b1t, a0*b1 + a1*b0) in 32-bit. + +// Bounds: +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 +.macro pmull d, a, b + smull \d\()0l.4s, \a\()0.4h, \b\()0.4h + smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smull \d\()1l.4s, \a\()0.4h, \b\()1.4h + smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro pmlal d, a, b + smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h + smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h + smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h + smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h + + smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h + smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h + smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h + smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h +.endm + +.macro ld2_wrap a, ptr + ldr q_tmp0, [\ptr\()], #32 + ldr q_tmp1, [\ptr\(), #-16] + uzp1 \a\()0.8h, tmp0.8h, tmp1.8h + uzp2 \a\()1.8h, tmp0.8h, tmp1.8h +.endm + +.macro st2_wrap a, ptr + zip1 tmp0.8h, \a\()0.8h, \a\()1.8h + zip2 tmp1.8h, \a\()0.8h, \a\()1.8h + str q_tmp0, [\ptr\()], #32 + str q_tmp1, [\ptr\(), #-16] +.endm + +.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr + ld2_wrap \a\(), \a_ptr + ld2_wrap \b\(), \b_ptr + ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + + out .req x0 + a0_ptr .req x1 + b0_ptr .req x2 + b0_cache_ptr .req x3 + a1_ptr .req x4 + b1_ptr .req x5 + b1_cache_ptr .req x6 + a2_ptr .req x7 + b2_ptr .req x8 + b2_cache_ptr 
.req x9 + a3_ptr .req x10 + b3_ptr .req x11 + b3_cache_ptr .req x12 + count .req x13 + wtmp .req w14 + + modulus .req v0 + modulus_twisted .req v2 + + aa0 .req v3 + aa1 .req v4 + bb0 .req v5 + bb1 .req v6 + bb1t .req v7 + + res0l .req v8 + res1l .req v9 + res0h .req v10 + res1h .req v11 + + tmp0 .req v12 + tmp1 .req v13 + q_tmp0 .req q12 + q_tmp1 .req q13 + + out0 .req v26 + out1 .req v27 + + t0 .req v28 + + .text + .global MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt) + .balign 4 +MLKEM_ASM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4_opt): + push_stack + mov wtmp, #3329 + dup modulus.8h, wtmp + + mov wtmp, #3327 + dup modulus_twisted.8h, wtmp + + // Computed bases of vector entries + + add a1_ptr, a0_ptr, #(1 * 512) + add b1_ptr, b0_ptr, #(1 * 512) + add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) + add a2_ptr, a0_ptr, #(2 * 512) + add b2_ptr, b0_ptr, #(2 * 512) + add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) + add a3_ptr, a0_ptr, #(3 * 512) + add b3_ptr, b0_ptr, #(3 * 512) + add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) + + // Bounds: + + // Each pmull is bound by 2*4096*2^15=2^28, so the final value + // before Montgomery reduction is bound by 2^30. + + mov count, #(MLKEM_N / 16) + // Instructions: 114 + // Expected cycles: 153 + // Expected IPC: 0.75 + // + // Cycle bound: 153.0 + // IPC bound: 0.75 + // + // Wall time: 0.69s + // User time: 0.69s + // + // ----------------------------------------------- original position -----------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + ldr q23, [x2, #16] // .*................................................................................................................ + ldr q19, [x2], #32 // *................................................................................................................. 
+ ldr q17, [x5], #32 // ..*............................................................................................................... + uzp2 v13.8H, v19.8H, v23.8H // ..........*....................................................................................................... + uzp1 v19.8H, v19.8H, v23.8H // ...........*...................................................................................................... + ldr q23, [x5, #-16] // ...*.............................................................................................................. + ldr q30, [x1, #16] // .....*............................................................................................................ + uzp2 v9.8H, v17.8H, v23.8H // ....*............................................................................................................. + uzp1 v23.8H, v17.8H, v23.8H // .......*.......................................................................................................... + ldr q17, [x1], #32 // ......*........................................................................................................... + ldr q10, [x7, #16] // .............*.................................................................................................... + uzp1 v12.8H, v17.8H, v30.8H // ........*......................................................................................................... + uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................ + smull2 v30.4S, v12.8H, v13.8H // ............*..................................................................................................... + smull v13.4S, v12.4H, v13.4H // ............................................*..................................................................... 
+ smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................ + smull v12.4S, v12.4H, v19.4H // ..........................................*....................................................................... + smlal2 v30.4S, v17.8H, v19.8H // ...............................*.................................................................................. + smlal v13.4S, v17.4H, v19.4H // ...............................................*.................................................................. + ldr q19, [x4], #32 // ....................*............................................................................................. + ldr q16, [x4, #-16] // .....................*............................................................................................ + ld1 {v8.8H}, [x3], #16 // ................................*................................................................................. + uzp1 v26.8H, v19.8H, v16.8H // .......................*.......................................................................................... + uzp2 v19.8H, v19.8H, v16.8H // ........................*......................................................................................... + smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................ + smlal v13.4S, v26.4H, v9.4H // ..................................................*............................................................... + smlal2 v22.4S, v17.8H, v8.8H // ........................................*......................................................................... + smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................ 
+ smlal2 v30.4S, v19.8H, v23.8H // ...................................*.............................................................................. + smlal v13.4S, v19.4H, v23.4H // .......................................................*.......................................................... + smlal2 v22.4S, v26.8H, v23.8H // ...........................................*...................................................................... + smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................ + ldr q23, [x7], #32 // ......................*........................................................................................... + ldr q17, [x8, #16] // ..............*................................................................................................... + uzp1 v9.8H, v23.8H, v10.8H // ..........................*....................................................................................... + uzp2 v23.8H, v23.8H, v10.8H // ....................................*............................................................................. + ldr q10, [x10], #32 // ...............*.................................................................................................. + ldr q16, [x10, #-16] // ................*................................................................................................. + ld1 {v8.8H}, [x12], #16 // .................*................................................................................................ + uzp1 v26.8H, v10.8H, v16.8H // ..................*............................................................................................... + uzp2 v10.8H, v10.8H, v16.8H // ...................*.............................................................................................. 
+ ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................ + ldr q3, [x11, #16] // ...........................*...................................................................................... + smlal2 v22.4S, v19.8H, v16.8H // ..............................................*................................................................... + smlal v12.4S, v19.4H, v16.4H // ........................................................*......................................................... + ldr q19, [x11], #32 // ............................*..................................................................................... + ld1 {v16.8H}, [x9], #16 // .............................*.................................................................................... + uzp1 v4.8H, v19.8H, v3.8H // ..................................*............................................................................... + uzp2 v19.8H, v19.8H, v3.8H // .......................................*.......................................................................... + ldr q3, [x8], #32 // ..............................*................................................................................... + ldr q31, [x2], #32 // ......................................*........................................................................... + uzp1 v6.8H, v3.8H, v17.8H // ...................................................*.............................................................. + uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................ + smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*....................................................... 
+ smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*...................................................... + smlal v13.4S, v9.4H, v17.4H // ............................................................*..................................................... + smlal v12.4S, v9.4H, v6.4H // .............................................................*.................................................... + smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*................................................... + smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*.................................................. + smlal v13.4S, v23.4H, v6.4H // ................................................................*................................................. + smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................ + smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*............................................... + smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*.............................................. + smlal v13.4S, v26.4H, v19.4H // ....................................................................*............................................. + smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................ + smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*........................................... + smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*.......................................... 
+ smlal v13.4S, v10.4H, v4.4H // ........................................................................*......................................... + smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................ + ldr q19, [x2, #-16] // .........................................*........................................................................ + uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*...................................... + uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*............................. + mul v23.8H, v23.8H, v2.8H // .............................................................................*.................................... + uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*................................. + uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*.............................. + mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................ + smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................ + smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*............................... + ldr q23, [x5], #32 // .............................................*.................................................................... + smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*...... 
+ uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*........................... + smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*..... + ldr q17, [x5, #-16] // ................................................*................................................................. + ldr q13, [x1, #16] // ......................................................*........................................................... + uzp2 v27.8H, v23.8H, v17.8H // ....................................................*............................................................. + uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*..................................... + uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*.. + ldr q23, [x1], #32 // ..........................................................................*....................................... + zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................* + ldr q3, [x7, #16] // ........................................................................................*......................... + uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*................................... + uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*.................................. + smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*.......................... 
+ ldr q6, [x8, #16] // .........................................................................................*........................ + ldr q23, [x10], #32 // ..........................................................................................*....................... + smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*....... + ldr q17, [x10, #-16] // ...........................................................................................*...................... + ld1 {v22.8H}, [x12], #16 // ............................................................................................*..................... + uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*.................... + uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*................... + ldr q23, [x4], #32 // ...............................................................................................*.................. + ldr q17, [x4, #-16] // ................................................................................................*................. + ldr q4, [x7], #32 // .................................................................................................*................ + uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*............... + uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*.............. + uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............ 
+ smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*... + ld1 {v8.8H}, [x6], #16 // ....................................................................................................*............. + ldr q25, [x11, #16] // ......................................................................................................*........... + ldr q29, [x11], #32 // .......................................................................................................*.......... + ld1 {v12.8H}, [x9], #16 // ........................................................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*. + ldr q14, [x8], #32 // .........................................................................................................*........ + ld1 {v23.8H}, [x3], #16 // .............................................................................................................*.... + + // ------------------------------------------------- new position --------------------------------------------------> + // 0 25 50 75 100 + // |------------------------|------------------------|------------------------|------------------------|------------- + // ldr q3, [x2], #32 // .*................................................................................................................ + // ldr q17, [x2, #-16] // *................................................................................................................. + // ldr q21, [x5], #32 // ..*............................................................................................................... + // ldr q19, [x5, #-16] // .....*............................................................................................................ 
+ // uzp2 v27.8H, v21.8H, v19.8H // .......*.......................................................................................................... + // ldr q25, [x1, #16] // ......*........................................................................................................... + // ldr q22, [x1], #32 // .........*........................................................................................................ + // uzp1 v28.8H, v21.8H, v19.8H // ........*......................................................................................................... + // uzp1 v31.8H, v22.8H, v25.8H // ...........*...................................................................................................... + // uzp2 v16.8H, v22.8H, v25.8H // ............*..................................................................................................... + // uzp2 v21.8H, v3.8H, v17.8H // ...*.............................................................................................................. + // uzp1 v19.8H, v3.8H, v17.8H // ....*............................................................................................................. + // smull2 v24.4S, v31.8H, v21.8H // .............*.................................................................................................... + // ldr q3, [x7, #16] // ..........*....................................................................................................... + // ldr q6, [x8, #16] // .................................*................................................................................ + // ldr q8, [x10], #32 // ....................................*............................................................................. + // ldr q26, [x10, #-16] // .....................................*............................................................................ 
+ // ld1 {v22.8H}, [x12], #16 // ......................................*........................................................................... + // uzp1 v30.8H, v8.8H, v26.8H // .......................................*.......................................................................... + // uzp2 v11.8H, v8.8H, v26.8H // ........................................*......................................................................... + // ldr q8, [x4], #32 // ...................*.............................................................................................. + // ldr q26, [x4, #-16] // ....................*............................................................................................. + // ldr q4, [x7], #32 // ................................*................................................................................. + // uzp1 v20.8H, v8.8H, v26.8H // ......................*........................................................................................... + // uzp2 v26.8H, v8.8H, v26.8H // .......................*.......................................................................................... + // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................ + // uzp1 v9.8H, v4.8H, v3.8H // ..................................*............................................................................... + // ldr q25, [x11, #16] // ..........................................*....................................................................... + // ldr q29, [x11], #32 // .............................................*.................................................................... + // ld1 {v12.8H}, [x9], #16 // ..............................................*................................................................... 
+ // ldr q14, [x8], #32 // .................................................*................................................................ + // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................ + // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................ + // smlal2 v24.4S, v20.8H, v27.8H // ........................*......................................................................................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*.................................................................. + // smlal2 v24.4S, v26.8H, v28.8H // ............................*..................................................................................... + // uzp2 v4.8H, v4.8H, v3.8H // ...................................*.............................................................................. + // smull2 v13.4S, v31.8H, v19.8H // ...............*.................................................................................................. + // ldr q3, [x2], #32 // ..................................................*............................................................... + // uzp2 v1.8H, v29.8H, v25.8H // ................................................*................................................................. + // smlal2 v13.4S, v16.8H, v23.8H // ..........................*....................................................................................... + // ldr q17, [x2, #-16] // .....................................................................*............................................ + // smull v18.4S, v31.4H, v19.4H // ................*................................................................................................. 
+ // smlal2 v13.4S, v20.8H, v28.8H // ..............................*................................................................................... + // smull v29.4S, v31.4H, v21.4H // ..............*................................................................................................... + // ldr q21, [x5], #32 // ..............................................................................*................................... + // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*...................................................................... + // smlal v29.4S, v16.4H, v19.4H // ..................*............................................................................................... + // ldr q19, [x5, #-16] // ..................................................................................*............................... + // smlal v18.4S, v16.4H, v23.4H // ...........................*...................................................................................... + // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................ + // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*.............................................................. + // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*............................. + // smlal v18.4S, v20.4H, v28.4H // ...............................*.................................................................................. + // ldr q25, [x1, #16] // ...................................................................................*.............................. + // smlal v29.4S, v26.4H, v28.4H // .............................*.................................................................................... 
+ // smlal v18.4S, v26.4H, v8.4H // ............................................*..................................................................... + // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*............................................................. + // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................ + // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*........................................................... + // smlal v29.4S, v9.4H, v26.4H // .......................................................*.......................................................... + // smlal v18.4S, v9.4H, v31.4H // ........................................................*......................................................... + // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................ + // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*....................................................... + // smlal v29.4S, v4.4H, v31.4H // ...........................................................*...................................................... + // smlal v18.4S, v4.4H, v12.4H // ............................................................*..................................................... + // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................................................... + // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*................................................... + // smlal v29.4S, v30.4H, v1.4H // ...............................................................*.................................................. 
+ // smlal v18.4S, v30.4H, v10.4H // ................................................................*................................................. + // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................ + // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*............................................... + // smlal v29.4S, v11.4H, v10.4H // ...................................................................*.............................................. + // smlal v18.4S, v11.4H, v22.4H // ....................................................................*............................................. + // ldr q22, [x1], #32 // .......................................................................................*.......................... + // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*........................................... + // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................ + // mul v19.8H, v31.8H, v2.8H // ........................................................................*......................................... + // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*....................... + // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*...................... + // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................ + // smlal v29.4S, v19.4H, v0.4H // ............................................................................*..................................... 
+ // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*.................................... + // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*....................................... + // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*.......................................... + // mul v23.8H, v26.8H, v2.8H // ...........................................................................*...................................... + // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*................................. + // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*..................... + // ldr q3, [x7, #16] // .........................................................................................*........................ + // ldr q6, [x8, #16] // .............................................................................................*.................... + // ldr q8, [x10], #32 // ..............................................................................................*................... + // ldr q26, [x10, #-16] // ................................................................................................*................. + // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................ + // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*............... + // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*.............. 
+ // ldr q8, [x4], #32 // ....................................................................................................*............. + // ldr q26, [x4, #-16] // .....................................................................................................*............ + // ldr q4, [x7], #32 // ......................................................................................................*........... + // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*.......... + // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*......... + // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*...... + // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........ + // ldr q25, [x11, #16] // ............................................................................................................*..... + // ldr q29, [x11], #32 // .............................................................................................................*.... + // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*... + // ldr q14, [x8], #32 // ................................................................................................................*. + // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*.................. + // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*.................................. 
+ // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................ + // ld1 {v23.8H}, [x3], #16 // .................................................................................................................* + // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*........................... + // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*.. + // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*......................... + + sub count, count, #2 +polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop: + // Instructions: 82 + // Expected cycles: 102 + // Expected IPC: 0.80 + // + // Cycle bound: 102.0 + // IPC bound: 0.80 + // + // Wall time: 15.93s + // User time: 15.93s + // + // ------------------------------- original position -------------------------------> + // 0 25 50 75 + // |------------------------|------------------------|------------------------|------ + smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................ + uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................ + smull2 v13.4S, v31.8H, v19.8H // ..........*....................................................................... + ldr q3, [x2], #32 // ....e............................................................................. + uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*....................... 
+ smlal2 v13.4S, v16.8H, v23.8H // ............*..................................................................... + ldr q17, [x2, #-16] // .....e............................................................................ + smull v18.4S, v31.4H, v19.4H // .........*........................................................................ + smlal2 v13.4S, v20.8H, v28.8H // ...........................*...................................................... + smull v29.4S, v31.4H, v21.4H // .............*.................................................................... + ldr q21, [x5], #32 // .....................e............................................................ + smlal2 v13.4S, v26.8H, v8.8H // .............................*.................................................... + smlal v29.4S, v16.4H, v19.4H // ...............*.................................................................. + ldr q19, [x5, #-16] // ......................e........................................................... + smlal v18.4S, v16.4H, v23.4H // ...........*...................................................................... + smlal v29.4S, v20.4H, v27.4H // ..............................*................................................... + uzp1 v31.8H, v14.8H, v6.8H // ........................................*......................................... + uzp2 v27.8H, v21.8H, v19.8H // ........................e......................................................... + smlal v18.4S, v20.4H, v28.4H // ..........................*....................................................... + ldr q25, [x1, #16] // .e................................................................................ + smlal v29.4S, v26.4H, v28.4H // ................................*................................................. + smlal v18.4S, v26.4H, v8.4H // ............................*..................................................... 
+ uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................ + smlal2 v13.4S, v9.8H, v31.8H // ............................................*..................................... + smlal2 v24.4S, v9.8H, v26.8H // ................................................*................................. + smlal v29.4S, v9.4H, v26.4H // ...............................................*.................................. + smlal v18.4S, v9.4H, v31.4H // ...........................................*...................................... + smlal2 v13.4S, v4.8H, v12.8H // ..............................................*................................... + smlal2 v24.4S, v4.8H, v31.8H // ..................................................*............................... + smlal v29.4S, v4.4H, v31.4H // .................................................*................................ + smlal v18.4S, v4.4H, v12.4H // .............................................*.................................... + smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................... + smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................ + smlal v29.4S, v30.4H, v1.4H // ................................................................*................. + smlal v18.4S, v30.4H, v10.4H // ............................................................*..................... + smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*.................. + smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*.............. + smlal v29.4S, v11.4H, v10.4H // ..................................................................*............... + smlal v18.4S, v11.4H, v22.4H // ..............................................................*................... 
+ ldr q22, [x1], #32 // e................................................................................. + uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........ + uzp1 v28.8H, v21.8H, v19.8H // .......................e.......................................................... + mul v19.8H, v31.8H, v2.8H // ..........................................................................*....... + uzp1 v31.8H, v22.8H, v25.8H // ..e............................................................................... + uzp2 v16.8H, v22.8H, v25.8H // ...e.............................................................................. + uzp2 v21.8H, v3.8H, v17.8H // .......e.......................................................................... + smlal v29.4S, v19.4H, v0.4H // ...........................................................................*...... + smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*..... + uzp1 v19.8H, v3.8H, v17.8H // ......e........................................................................... + uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*............. + zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l.. + mul v23.8H, v26.8H, v2.8H // .....................................................................*............ + uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*.... + smull2 v24.4S, v31.8H, v21.8H // ..............e................................................................... + str q14, [x0, #16] // .................................................................................l + ldr q3, [x7, #16] // ...................................e.............................................. 
+ ldr q6, [x8, #16] // .......................................e.......................................... + ldr q8, [x10], #32 // ...................................................e.............................. + ldr q26, [x10, #-16] // ....................................................e............................. + ld1 {v22.8H}, [x12], #16 // ...........................................................e...................... + uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................ + uzp2 v11.8H, v8.8H, v26.8H // ......................................................e........................... + ldr q8, [x4], #32 // .................e................................................................ + ldr q26, [x4, #-16] // ..................e............................................................... + ldr q4, [x7], #32 // ..................................e............................................... + uzp1 v20.8H, v8.8H, v26.8H // ...................e.............................................................. + uzp2 v26.8H, v8.8H, v26.8H // ....................e............................................................. + ld1 {v8.8H}, [x6], #16 // .........................e........................................................ + uzp1 v9.8H, v4.8H, v3.8H // ....................................e............................................. + ldr q25, [x11, #16] // ........................................................e......................... + ldr q29, [x11], #32 // .......................................................e.......................... + ld1 {v12.8H}, [x9], #16 // ..........................................e....................................... + ldr q14, [x8], #32 // ......................................e........................................... + smlal2 v24.4S, v16.8H, v19.8H // ................e................................................................. 
+ smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*.......... + smlal v18.4S, v23.4H, v0.4H // ......................................................................*........... + ld1 {v23.8H}, [x3], #16 // ........e......................................................................... + smlal2 v24.4S, v20.8H, v27.8H // ...............................e.................................................. + uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*......... + uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................ + str q5, [x0], #32 // ................................................................................l. + zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*... + + // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 200 225 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- + // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~......................................... + // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~............................................................. 
+ // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~..................................... + // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~.................................... + // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~............................................................................. + // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~.......................................................................... + // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................ + // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~................................... + // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~.... 
+ // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~......................................................................... + // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~.............................................................................. + // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~.................................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~........................................................................... + // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~....................................................................... + // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~........................... + // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~.................................................................... 
+ // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~....... + // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~.................. + // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~................. + // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~............... + // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~.............. + // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~...................................................................... + // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~................................................................... 
+ // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~....................................... + // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~............................................................... + // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~............. + // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~.............................................................. + // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................ + // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~........................................................... + // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~..................................................................... 
+ // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~................................................................. + // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~... + // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................ + // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................ + // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................ + // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~......................... + // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............ 
+ // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~............................................................................... + // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........ + // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................ + // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................ + // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~.......................................................... + // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~......... + // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~...................................................... 
+ // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~......................................................... + // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~.................................................. + // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~..................................................... + // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~....................................................... + // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................ + // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~................................................... + // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~.................................................... 
+ // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~....................... + // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~...................... + // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~.................... + // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~................... + // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~.......... + // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~........... + // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~. 
+ // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................ + // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~..................... + // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~.............................................. + // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~................................................. + // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~.......................................... + // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~............................................. + // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~............................................... 
+ // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................ + // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~........................................... + // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................ + // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~............................... + // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~............................. + // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~..... + // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~...... 
+ // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~.. + // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................ + // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~...................................... + // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~.................................. + // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~................................. + // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................ + // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................ 
+ // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l.............................. + // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l + // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l.......................... + + sub count, count, #1 + cbnz count, polyvec_basemul_acc_montgomery_cached_asm_k4_opt_loop + + // Instructions: 50 + // Expected cycles: 56 + // Expected IPC: 0.89 + // + // Cycle bound: 56.0 + // IPC bound: 0.89 + // + // Wall time: 4.16s + // User time: 4.16s + // + // --------------- original position ---------------> + // 0 25 + // |------------------------| + smull2 v17.4S, v31.8H, v19.8H // ..*............................................... + uzp2 v1.8H, v14.8H, v6.8H // ................*................................. + smull v18.4S, v31.4H, v21.4H // .......*.......................................... + smlal2 v24.4S, v26.8H, v28.8H // *................................................. + smlal2 v17.4S, v16.8H, v23.8H // ....*............................................. + smull v21.4S, v31.4H, v19.4H // .....*............................................ + smlal v18.4S, v16.4H, v19.4H // .........*........................................ + uzp2 v31.8H, v4.8H, v3.8H // .*................................................ + uzp1 v3.8H, v14.8H, v6.8H // ............*..................................... 
+ smlal v21.4S, v16.4H, v23.4H // ..........*....................................... + smlal v18.4S, v20.4H, v27.4H // ...........*...................................... + uzp2 v14.8H, v29.8H, v25.8H // ...*.............................................. + smlal2 v17.4S, v20.8H, v28.8H // ......*........................................... + smlal v21.4S, v20.4H, v28.4H // .............*.................................... + smlal v18.4S, v26.4H, v28.4H // ..............*................................... + smlal2 v24.4S, v9.8H, v1.8H // ..................*............................... + smlal2 v17.4S, v26.8H, v8.8H // ........*......................................... + smlal v21.4S, v26.4H, v8.4H // ...............*.................................. + smlal v18.4S, v9.4H, v1.4H // ...................*.............................. + smlal2 v24.4S, v31.8H, v3.8H // ......................*........................... + smlal2 v17.4S, v9.8H, v3.8H // .................*................................ + smlal v21.4S, v9.4H, v3.4H // ....................*............................. + smlal v18.4S, v31.4H, v3.4H // .......................*.......................... + smlal2 v24.4S, v30.8H, v14.8H // ..........................*....................... + smlal2 v17.4S, v31.8H, v12.8H // .....................*............................ + smlal v21.4S, v31.4H, v12.4H // ........................*......................... + smlal v18.4S, v30.4H, v14.4H // ...........................*...................... + smlal2 v24.4S, v11.8H, v10.8H // ..............................*................... + smlal2 v17.4S, v30.8H, v10.8H // .........................*........................ + smlal v21.4S, v30.4H, v10.4H // ............................*..................... + smlal v18.4S, v11.4H, v10.4H // ...............................*.................. + zip2 v19.8H, v7.8H, v15.8H // ......................................*........... 
+ smlal2 v17.4S, v11.8H, v22.8H // .............................*.................... + smlal v21.4S, v11.4H, v22.4H // ................................*................. + uzp1 v23.8H, v18.8H, v24.8H // .................................*................ + str q19, [x0, #16] // .........................................*........ + mul v19.8H, v23.8H, v2.8H // ..................................*............... + uzp1 v23.8H, v21.8H, v17.8H // .....................................*............ + str q5, [x0], #32 // .............................................*.... + mul v26.8H, v23.8H, v2.8H // .......................................*.......... + smlal v18.4S, v19.4H, v0.4H // ...................................*.............. + smlal2 v24.4S, v19.8H, v0.8H // ....................................*............. + smlal v21.4S, v26.4H, v0.4H // ...........................................*...... + smlal2 v17.4S, v26.8H, v0.8H // ..........................................*....... + uzp2 v13.8H, v18.8H, v24.8H // ........................................*......... + uzp2 v19.8H, v21.8H, v17.8H // ............................................*..... + zip1 v23.8H, v19.8H, v13.8H // ..............................................*... + zip2 v19.8H, v19.8H, v13.8H // ...............................................*.. + str q23, [x0], #32 // .................................................* + str q19, [x0, #-16] // ................................................*. + + // ----------------- new position ------------------> + // 0 25 + // |------------------------|------------------------ + // smlal2 v24.4S, v26.8H, v28.8H // ...*.............................................. + // uzp2 v4.8H, v4.8H, v3.8H // .......*.......................................... + // smull2 v13.4S, v31.8H, v19.8H // *................................................. + // uzp2 v1.8H, v29.8H, v25.8H // ...........*...................................... 
+ // smlal2 v13.4S, v16.8H, v23.8H // ....*............................................. + // smull v18.4S, v31.4H, v19.4H // .....*............................................ + // smlal2 v13.4S, v20.8H, v28.8H // ............*..................................... + // smull v29.4S, v31.4H, v21.4H // ..*............................................... + // smlal2 v13.4S, v26.8H, v8.8H // ................*................................. + // smlal v29.4S, v16.4H, v19.4H // ......*........................................... + // smlal v18.4S, v16.4H, v23.4H // .........*........................................ + // smlal v29.4S, v20.4H, v27.4H // ..........*....................................... + // uzp1 v31.8H, v14.8H, v6.8H // ........*......................................... + // smlal v18.4S, v20.4H, v28.4H // .............*.................................... + // smlal v29.4S, v26.4H, v28.4H // ..............*................................... + // smlal v18.4S, v26.4H, v8.4H // .................*................................ + // uzp2 v26.8H, v14.8H, v6.8H // .*................................................ + // smlal2 v13.4S, v9.8H, v31.8H // ....................*............................. + // smlal2 v24.4S, v9.8H, v26.8H // ...............*.................................. + // smlal v29.4S, v9.4H, v26.4H // ..................*............................... + // smlal v18.4S, v9.4H, v31.4H // .....................*............................ + // smlal2 v13.4S, v4.8H, v12.8H // ........................*......................... + // smlal2 v24.4S, v4.8H, v31.8H // ...................*.............................. + // smlal v29.4S, v4.4H, v31.4H // ......................*........................... + // smlal v18.4S, v4.4H, v12.4H // .........................*........................ + // smlal2 v13.4S, v30.8H, v10.8H // ............................*..................... 
+ // smlal2 v24.4S, v30.8H, v1.8H // .......................*.......................... + // smlal v29.4S, v30.4H, v1.4H // ..........................*....................... + // smlal v18.4S, v30.4H, v10.4H // .............................*.................... + // smlal2 v13.4S, v11.8H, v22.8H // ................................*................. + // smlal2 v24.4S, v11.8H, v10.8H // ...........................*...................... + // smlal v29.4S, v11.4H, v10.4H // ..............................*................... + // smlal v18.4S, v11.4H, v22.4H // .................................*................ + // uzp1 v31.8H, v29.8H, v24.8H // ..................................*............... + // mul v19.8H, v31.8H, v2.8H // ....................................*............. + // smlal v29.4S, v19.4H, v0.4H // ........................................*......... + // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........ + // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............ + // zip2 v14.8H, v7.8H, v15.8H // ...............................*.................. + // mul v23.8H, v26.8H, v2.8H // .......................................*.......... + // uzp2 v15.8H, v29.8H, v24.8H // ............................................*..... + // str q14, [x0, #16] // ...................................*.............. + // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*...... + // smlal v18.4S, v23.4H, v0.4H // ..........................................*....... + // uzp2 v7.8H, v18.8H, v13.8H // .............................................*.... + // str q5, [x0], #32 // ......................................*........... + // zip1 v5.8H, v7.8H, v15.8H // ..............................................*... + // zip2 v14.8H, v7.8H, v15.8H // ...............................................*.. 
+ // str q14, [x0, #16] // .................................................* + // str q5, [x0], #32 // ................................................*. + + + pop_stack + ret + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq out + .unreq a0_ptr + .unreq b0_ptr + .unreq b0_cache_ptr + .unreq a1_ptr + .unreq b1_ptr + .unreq b1_cache_ptr + .unreq a2_ptr + .unreq b2_ptr + .unreq b2_cache_ptr + .unreq a3_ptr + .unreq b3_ptr + .unreq b3_cache_ptr + .unreq count + .unreq modulus + .unreq modulus_twisted + .unreq wtmp + .unreq aa0 + .unreq aa1 + .unreq bb0 + .unreq bb1 + .unreq bb1t + .unreq res0l + .unreq res1l + .unreq res0h + .unreq res1h + .unreq tmp0 + .unreq tmp1 + .unreq q_tmp0 + .unreq q_tmp1 + .unreq out0 + .unreq out1 + .unreq t0 + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT && MLKEM_K == 4 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S deleted file mode 100644 index 94f0889b7..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_clean.S +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ -// -// AArch64 re-implementation of the asymmetric base multiplication from: -// -// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 -// https://eprint.iacr.org/2021/986 -// https://github.com/neon-ntt/neon-ntt - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) - -// Input: -// - Vectors al, ah of 32-bit entries -// Output: -// - Montgomery reductions of al || ah, stored in al -.macro montgomery_reduce_long x, a - uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, modulus_twisted.8h - smlal \a\()l.4s, t0.4h, modulus.4h - smlal2 \a\()h.4s, t0.8h, modulus.8h - uzp2 \x\().8h, \a\()l.8h, \a\()h.8h -.endm - 
-// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit. -// -// Bounds: -// - Assume |a| < 4096, -// - Result: < 2*4096*2^15 = 2^28 -.macro pmull d, a, b - smull \d\()0l.4s, \a\()0.4h, \b\()0.4h - smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smull \d\()1l.4s, \a\()0.4h, \b\()1.4h - smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro pmlal d, a, b - smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h - smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h - smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro ld2_wrap a, ptr - ldr q_tmp0, [\ptr\()], #32 - ldr q_tmp1, [\ptr\(), #-16] - uzp1 \a\()0.8h, tmp0.8h, tmp1.8h - uzp2 \a\()1.8h, tmp0.8h, tmp1.8h -.endm - -.macro st2_wrap a, ptr - zip1 tmp0.8h, \a\()0.8h, \a\()1.8h - zip2 tmp1.8h, \a\()0.8h, \a\()1.8h - str q_tmp0, [\ptr\()], #32 - str q_tmp1, [\ptr\(), #-16] -.endm - -.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2_wrap \a\(), \a_ptr - ld2_wrap \b\(), \b_ptr - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs -.endm - -.macro pop_stack - restore_vregs -.endm - - out .req x0 - a0_ptr .req x1 - b0_ptr .req x2 - b0_cache_ptr .req x3 - a1_ptr .req x4 - b1_ptr .req x5 - b1_cache_ptr .req x6 - a2_ptr .req x7 - b2_ptr .req x8 - b2_cache_ptr .req x9 - a3_ptr .req x10 - b3_ptr .req x11 - b3_cache_ptr 
.req x12 - count .req x13 - wtmp .req w14 - - modulus .req v0 - modulus_twisted .req v2 - - aa0 .req v3 - aa1 .req v4 - bb0 .req v5 - bb1 .req v6 - bb1t .req v7 - - res0l .req v8 - res1l .req v9 - res0h .req v10 - res1h .req v11 - - tmp0 .req v12 - tmp1 .req v13 - q_tmp0 .req q12 - q_tmp1 .req q13 - - out0 .req v26 - out1 .req v27 - - t0 .req v28 - -#if MLKEM_K == 2 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean): - push_stack - - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - - mov count, #(MLKEM_N / 16) -k2_loop_start: - - load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr - pmull res, aa, bb - load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr - pmlal res, aa, bb - - montgomery_reduce_long out0, res0 - montgomery_reduce_long out1, res1 - - st2_wrap out, out - - subs count, count, #1 - cbnz count, k2_loop_start - - pop_stack - ret -#endif /* MLKEM_K == 2 */ - -#if MLKEM_K == 3 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - - mov count, #(MLKEM_N / 16) -k3_loop_start: - - load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr - pmull res, aa, bb - load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr - pmlal res, aa, bb - load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr - pmlal res, aa, 
bb - - montgomery_reduce_long out0, res0 - montgomery_reduce_long out1, res1 - - st2_wrap out, out - - subs count, count, #1 - cbnz count, k3_loop_start - - pop_stack - ret -#endif /* MLKEM_K == 3 */ - -#if MLKEM_K == 4 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_clean): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - add a3_ptr, a0_ptr, #(3 * 512) - add b3_ptr, b0_ptr, #(3 * 512) - add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) - - // Bounds: - // - // Each pmull is bound by 2*4096*2^15=2^28, so the final value - // before Montgomery reduction is bound by 2^30. 
- - mov count, #(MLKEM_N / 16) -k4_loop_start: - - load_polys aa, bb, a0_ptr, b0_ptr, b0_cache_ptr - pmull res, aa, bb - load_polys aa, bb, a1_ptr, b1_ptr, b1_cache_ptr - pmlal res, aa, bb - load_polys aa, bb, a2_ptr, b2_ptr, b2_cache_ptr - pmlal res, aa, bb - load_polys aa, bb, a3_ptr, b3_ptr, b3_cache_ptr - pmlal res, aa, bb - - montgomery_reduce_long out0, res0 - montgomery_reduce_long out1, res1 - - st2_wrap out, out - - subs count, count, #1 - cbnz count, k4_loop_start - - pop_stack - ret -#endif /* MLKEM_K == 4 */ - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq out - .unreq a0_ptr - .unreq b0_ptr - .unreq b0_cache_ptr - .unreq a1_ptr - .unreq b1_ptr - .unreq b1_cache_ptr - .unreq a2_ptr - .unreq b2_ptr - .unreq b2_cache_ptr - .unreq a3_ptr - .unreq b3_ptr - .unreq b3_cache_ptr - .unreq count - .unreq modulus - .unreq modulus_twisted - .unreq aa0 - .unreq aa1 - .unreq bb0 - .unreq bb1 - .unreq bb1t - .unreq res0l - .unreq res1l - .unreq res0h - .unreq wtmp - .unreq res1h - .unreq tmp0 - .unreq tmp1 - .unreq q_tmp0 - .unreq q_tmp1 - .unreq out0 - .unreq out1 - .unreq t0 - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S deleted file mode 100644 index 275ca06d2..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/polyvec_opt.S +++ /dev/null @@ -1,1606 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -// AArch64 re-implementation of the asymmetric base multiplication from: - -// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1 -// https://eprint.iacr.org/2021/986 -// https://github.com/neon-ntt/neon-ntt - -#include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) - -// Input: -// - Vectors al, ah of 32-bit entries -// 
Output: -// - Montgomery reductions of al || ah, stored in al -.macro montgomery_reduce_long x, a - uzp1 t0.8h, \a\()l.8h, \a\()h.8h - mul t0.8h, t0.8h, modulus_twisted.8h - smlal \a\()l.4s, t0.4h, modulus.4h - smlal2 \a\()h.4s, t0.8h, modulus.8h - uzp2 \x\().8h, \a\()l.8h, \a\()h.8h -.endm - -// Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit. - -// Bounds: -// - Assume |a| < 4096, -// - Result: < 2*4096*2^15 = 2^28 -.macro pmull d, a, b - smull \d\()0l.4s, \a\()0.4h, \b\()0.4h - smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smull \d\()1l.4s, \a\()0.4h, \b\()1.4h - smull2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro pmlal d, a, b - smlal \d\()0l.4s, \a\()0.4h, \b\()0.4h - smlal2 \d\()0h.4s, \a\()0.8h, \b\()0.8h - smlal \d\()0l.4s, \a\()1.4h, \b\()1t.4h - smlal2 \d\()0h.4s, \a\()1.8h, \b\()1t.8h - - smlal \d\()1l.4s, \a\()0.4h, \b\()1.4h - smlal2 \d\()1h.4s, \a\()0.8h, \b\()1.8h - smlal \d\()1l.4s, \a\()1.4h, \b\()0.4h - smlal2 \d\()1h.4s, \a\()1.8h, \b\()0.8h -.endm - -.macro ld2_wrap a, ptr - ldr q_tmp0, [\ptr\()], #32 - ldr q_tmp1, [\ptr\(), #-16] - uzp1 \a\()0.8h, tmp0.8h, tmp1.8h - uzp2 \a\()1.8h, tmp0.8h, tmp1.8h -.endm - -.macro st2_wrap a, ptr - zip1 tmp0.8h, \a\()0.8h, \a\()1.8h - zip2 tmp1.8h, \a\()0.8h, \a\()1.8h - str q_tmp0, [\ptr\()], #32 - str q_tmp1, [\ptr\(), #-16] -.endm - -.macro load_polys a, b, a_ptr, b_ptr, b_cache_ptr - ld2_wrap \a\(), \a_ptr - ld2_wrap \b\(), \b_ptr - ld1 {\b\()1t.8h}, [\b_cache_ptr], #16 -.endm - -.macro save_vregs - sub sp, sp, #(16*4) - stp d8, d9, [sp, #16*0] - stp d10, d11, [sp, #16*1] - stp d12, d13, [sp, #16*2] - stp d14, d15, [sp, #16*3] -.endm - -.macro restore_vregs - ldp d8, d9, [sp, #16*0] - ldp d10, d11, [sp, #16*1] - ldp d12, d13, [sp, #16*2] - ldp d14, d15, [sp, #16*3] - add sp, sp, #(16*4) -.endm - -.macro push_stack - save_vregs 
-.endm - -.macro pop_stack - restore_vregs -.endm - - out .req x0 - a0_ptr .req x1 - b0_ptr .req x2 - b0_cache_ptr .req x3 - a1_ptr .req x4 - b1_ptr .req x5 - b1_cache_ptr .req x6 - a2_ptr .req x7 - b2_ptr .req x8 - b2_cache_ptr .req x9 - a3_ptr .req x10 - b3_ptr .req x11 - b3_cache_ptr .req x12 - count .req x13 - wtmp .req w14 - - modulus .req v0 - modulus_twisted .req v2 - - aa0 .req v3 - aa1 .req v4 - bb0 .req v5 - bb1 .req v6 - bb1t .req v7 - - res0l .req v8 - res1l .req v9 - res0h .req v10 - res1h .req v11 - - tmp0 .req v12 - tmp1 .req v13 - q_tmp0 .req q12 - q_tmp1 .req q13 - - out0 .req v26 - out1 .req v27 - - t0 .req v28 - -#if MLKEM_K == 2 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt): - push_stack - - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - - mov count, #(MLKEM_N / 16) - // Instructions: 75 - // Expected cycles: 94 - // Expected IPC: 0.80 - - // Cycle bound: 94.0 - // IPC bound: 0.80 - - // Wall time: 1.49s - // User time: 1.49s - - // --------------------------- original position ----------------------------> - // 0 25 50 - // |------------------------|------------------------| - ldr q9, [x4], #32 // *.......................................................................... - ldr q5, [x4, #-16] // ......*.................................................................... - ldr q11, [x5], #32 // .*......................................................................... - uzp1 v23.8H, v9.8H, v5.8H // .........*................................................................. - uzp2 v9.8H, v9.8H, v5.8H // .....................*..................................................... 
- ldr q5, [x2], #32 // ..*........................................................................ - ldr q7, [x5, #-16] // ..............*............................................................ - ldr q21, [x2, #-16] // ...*....................................................................... - uzp2 v10.8H, v11.8H, v7.8H // .................*......................................................... - uzp1 v11.8H, v11.8H, v7.8H // ..................*........................................................ - uzp1 v7.8H, v5.8H, v21.8H // ....*...................................................................... - uzp2 v5.8H, v5.8H, v21.8H // .....*..................................................................... - ldr q21, [x1], #32 // .......*................................................................... - ldr q25, [x1, #-16] // ........*.................................................................. - ld1 {v6.8H}, [x3], #16 // ............................*.............................................. - uzp1 v26.8H, v21.8H, v25.8H // ..........*................................................................ - uzp2 v21.8H, v21.8H, v25.8H // ...........*............................................................... - smull v25.4S, v26.4H, v5.4H // ............*.............................................................. - smull2 v5.4S, v26.8H, v5.8H // .............*............................................................. - smull v19.4S, v26.4H, v7.4H // ..........................*................................................ - smull2 v26.4S, v26.8H, v7.8H // ..............................*............................................ - smlal v25.4S, v21.4H, v7.4H // ...............*........................................................... - smlal2 v5.4S, v21.8H, v7.8H // ................*.......................................................... 
- smlal v19.4S, v21.4H, v6.4H // ...................................*....................................... - smlal2 v26.4S, v21.8H, v6.8H // .................................*......................................... - smlal v25.4S, v23.4H, v10.4H // ...................*....................................................... - smlal2 v5.4S, v23.8H, v10.8H // ....................*...................................................... - smlal v19.4S, v23.4H, v11.4H // ......................................*.................................... - smlal2 v26.4S, v23.8H, v11.8H // ....................................*...................................... - ld1 {v23.8H}, [x6], #16 // ........................*.................................................. - smlal v25.4S, v9.4H, v11.4H // ......................*.................................................... - smlal2 v5.4S, v9.8H, v11.8H // .......................*................................................... - smlal2 v26.4S, v9.8H, v23.8H // .......................................*................................... - smlal v19.4S, v9.4H, v23.4H // .........................................*................................. - ldr q9, [x4], #32 // ...............................*........................................... - uzp1 v11.8H, v25.8H, v5.8H // .........................*................................................. - uzp1 v23.8H, v19.8H, v26.8H // .............................................*............................. - mul v11.8H, v11.8H, v2.8H // ...........................*............................................... - mul v23.8H, v23.8H, v2.8H // ..............................................*............................ - ldr q7, [x5], #32 // ................................*.......................................... - smlal2 v5.4S, v11.8H, v0.8H // .............................*............................................. 
- smlal v25.4S, v11.4H, v0.4H // ..................................*........................................ - ldr q11, [x2], #32 // .....................................*..................................... - ldr q21, [x2, #-16] // ........................................*.................................. - ldr q6, [x4, #-16] // ...............................................*........................... - uzp1 v17.8H, v11.8H, v21.8H // ...........................................*............................... - ldr q10, [x1], #32 // ................................................*.......................... - ldr q29, [x1, #-16] // .................................................*......................... - uzp2 v11.8H, v11.8H, v21.8H // ............................................*.............................. - uzp1 v13.8H, v9.8H, v6.8H // ...................................................*....................... - uzp1 v3.8H, v10.8H, v29.8H // ....................................................*...................... - uzp2 v10.8H, v10.8H, v29.8H // .....................................................*..................... - smull v12.4S, v3.4H, v11.4H // ......................................................*.................... - smull2 v11.4S, v3.8H, v11.8H // .......................................................*................... - ldr q21, [x5, #-16] // ........................................................*.................. - smlal v12.4S, v10.4H, v17.4H // .........................................................*................. - smlal2 v11.4S, v10.8H, v17.8H // ..........................................................*................ - uzp2 v29.8H, v7.8H, v21.8H // ...........................................................*............... - uzp1 v15.8H, v7.8H, v21.8H // ............................................................*.............. 
- smlal v12.4S, v13.4H, v29.4H // .............................................................*............. - smlal2 v11.4S, v13.8H, v29.8H // ..............................................................*............ - uzp2 v28.8H, v9.8H, v6.8H // ...............................................................*........... - smlal2 v26.4S, v23.8H, v0.8H // ..................................................*........................ - smlal v12.4S, v28.4H, v15.4H // .................................................................*......... - smlal2 v11.4S, v28.8H, v15.8H // ..................................................................*........ - smlal v19.4S, v23.4H, v0.4H // ................................................................*.......... - uzp2 v27.8H, v25.8H, v5.8H // ..........................................*................................ - smull v23.4S, v3.4H, v17.4H // ......................................................................*.... - uzp1 v9.8H, v12.8H, v11.8H // .....................................................................*..... - uzp2 v19.8H, v19.8H, v26.8H // ....................................................................*...... - mul v14.8H, v9.8H, v2.8H // .......................................................................*... - ld1 {v22.8H}, [x6], #16 // ...................................................................*....... - zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. - smlal2 v11.4S, v14.8H, v0.8H // ..........................................................................* - ld1 {v4.8H}, [x3], #16 // .........................................................................*. 
- - // ------------------------------ new position ------------------------------> - // 0 25 50 - // |------------------------|------------------------|------------------------ - // ldr q18, [x4], #32 // *.......................................................................... - // ldr q30, [x5], #32 // ..*........................................................................ - // ldr q8, [x2], #32 // .....*..................................................................... - // ldr q9, [x2, #-16] // .......*................................................................... - // uzp1 v17.8H, v8.8H, v9.8H // ..........*................................................................ - // uzp2 v4.8H, v8.8H, v9.8H // ...........*............................................................... - // ldr q19, [x4, #-16] // .*......................................................................... - // ldr q29, [x1], #32 // ............*.............................................................. - // ldr q12, [x1, #-16] // .............*............................................................. - // uzp1 v13.8H, v18.8H, v19.8H // ...*....................................................................... - // uzp1 v3.8H, v29.8H, v12.8H // ...............*........................................................... - // uzp2 v10.8H, v29.8H, v12.8H // ................*.......................................................... - // smull v12.4S, v3.4H, v4.4H // .................*......................................................... - // smull2 v11.4S, v3.8H, v4.8H // ..................*........................................................ - // ldr q5, [x5, #-16] // ......*.................................................................... - // smlal v12.4S, v10.4H, v17.4H // .....................*..................................................... 
- // smlal2 v11.4S, v10.8H, v17.8H // ......................*.................................................... - // uzp2 v14.8H, v30.8H, v5.8H // ........*.................................................................. - // uzp1 v15.8H, v30.8H, v5.8H // .........*................................................................. - // smlal v12.4S, v13.4H, v14.4H // .........................*................................................. - // smlal2 v11.4S, v13.8H, v14.8H // ..........................*................................................ - // uzp2 v28.8H, v18.8H, v19.8H // ....*...................................................................... - // smlal v12.4S, v28.4H, v15.4H // ..............................*............................................ - // smlal2 v11.4S, v28.8H, v15.8H // ...............................*........................................... - // ld1 {v22.8H}, [x6], #16 // .............................*............................................. - // uzp1 v1.8H, v12.8H, v11.8H // ...................................*....................................... - // smull v23.4S, v3.4H, v17.4H // ...................*....................................................... - // mul v14.8H, v1.8H, v2.8H // .....................................*..................................... - // ld1 {v4.8H}, [x3], #16 // ..............*............................................................ - // smlal2 v11.4S, v14.8H, v0.8H // ........................................*.................................. - // smull2 v20.4S, v3.8H, v17.8H // ....................*...................................................... - // ldr q18, [x4], #32 // ..................................*........................................ - // ldr q30, [x5], #32 // .......................................*................................... - // smlal2 v20.4S, v10.8H, v4.8H // ........................*.................................................. 
- // smlal v12.4S, v14.4H, v0.4H // .........................................*................................. - // smlal v23.4S, v10.4H, v4.4H // .......................*................................................... - // smlal2 v20.4S, v13.8H, v15.8H // ............................*.............................................. - // ldr q8, [x2], #32 // ..........................................*................................ - // smlal v23.4S, v13.4H, v15.4H // ...........................*............................................... - // smlal2 v20.4S, v28.8H, v22.8H // ................................*.......................................... - // ldr q9, [x2, #-16] // ...........................................*............................... - // smlal v23.4S, v28.4H, v22.4H // .................................*......................................... - // uzp2 v27.8H, v12.8H, v11.8H // ..................................................................*........ - // uzp1 v17.8H, v8.8H, v9.8H // .............................................*............................. - // uzp2 v4.8H, v8.8H, v9.8H // ................................................*.......................... - // uzp1 v5.8H, v23.8H, v20.8H // ....................................*...................................... - // mul v31.8H, v5.8H, v2.8H // ......................................*.................................... - // ldr q19, [x4, #-16] // ............................................*.............................. - // ldr q29, [x1], #32 // ..............................................*............................ - // ldr q12, [x1, #-16] // ...............................................*........................... - // smlal2 v20.4S, v31.8H, v0.8H // ..............................................................*............ - // uzp1 v13.8H, v18.8H, v19.8H // .................................................*......................... 
- // uzp1 v3.8H, v29.8H, v12.8H // ..................................................*........................ - // uzp2 v10.8H, v29.8H, v12.8H // ...................................................*....................... - // smull v12.4S, v3.4H, v4.4H // ....................................................*...................... - // smull2 v11.4S, v3.8H, v4.8H // .....................................................*..................... - // ldr q5, [x5, #-16] // ......................................................*.................... - // smlal v12.4S, v10.4H, v17.4H // .......................................................*................... - // smlal2 v11.4S, v10.8H, v17.8H // ........................................................*.................. - // uzp2 v14.8H, v30.8H, v5.8H // .........................................................*................. - // uzp1 v15.8H, v30.8H, v5.8H // ..........................................................*................ - // smlal v12.4S, v13.4H, v14.4H // ...........................................................*............... - // smlal2 v11.4S, v13.8H, v14.8H // ............................................................*.............. - // uzp2 v28.8H, v18.8H, v19.8H // .............................................................*............. - // smlal v23.4S, v31.4H, v0.4H // .................................................................*......... - // smlal v12.4S, v28.4H, v15.4H // ...............................................................*........... - // smlal2 v11.4S, v28.8H, v15.8H // ................................................................*.......... - // ld1 {v22.8H}, [x6], #16 // .......................................................................*... - // uzp2 v19.8H, v23.8H, v20.8H // .....................................................................*..... 
- // uzp1 v1.8H, v12.8H, v11.8H // ....................................................................*...... - // smull v23.4S, v3.4H, v17.4H // ...................................................................*....... - // mul v14.8H, v1.8H, v2.8H // ......................................................................*.... - // zip2 v9.8H, v19.8H, v27.8H // ........................................................................*.. - // ld1 {v4.8H}, [x3], #16 // ..........................................................................* - // smlal2 v11.4S, v14.8H, v0.8H // .........................................................................*. - - sub count, count, #2 -1: - // Instructions: 48 - // Expected cycles: 58 - // Expected IPC: 0.83 - - // Cycle bound: 58.0 - // IPC bound: 0.83 - - // Wall time: 6.39s - // User time: 6.39s - - // -------------- original position --------------> - // 0 25 - // |------------------------|---------------------- - smull2 v20.4S, v3.8H, v17.8H // ..........*..................................... - ldr q18, [x4], #32 // .................e.............................. - ldr q30, [x5], #32 // .....................e.......................... - smlal2 v20.4S, v10.8H, v4.8H // ............*................................... - smlal v12.4S, v14.4H, v0.4H // .........................................*...... - smlal v23.4S, v10.4H, v4.4H // ...........*.................................... - str q9, [x0, #16] // ...............................................l - smlal2 v20.4S, v13.8H, v15.8H // ...........................*.................... - ldr q8, [x2], #32 // ....e........................................... - smlal v23.4S, v13.4H, v15.4H // ..........................*..................... - smlal2 v20.4S, v28.8H, v22.8H // .............................*.................. - zip1 v26.8H, v19.8H, v27.8H // ............................................l... 
- ldr q9, [x2, #-16] // .....e.......................................... - smlal v23.4S, v28.4H, v22.4H // ............................*................... - uzp2 v27.8H, v12.8H, v11.8H // ...........................................*.... - uzp1 v17.8H, v8.8H, v9.8H // ......e......................................... - uzp2 v4.8H, v8.8H, v9.8H // .......e........................................ - uzp1 v5.8H, v23.8H, v20.8H // ..................................*............. - str q26, [x0], #32 // ..............................................l. - mul v31.8H, v5.8H, v2.8H // ...................................*............ - ldr q19, [x4, #-16] // ..................e............................. - ldr q29, [x1], #32 // e............................................... - ldr q12, [x1, #-16] // .e.............................................. - smlal2 v20.4S, v31.8H, v0.8H // .....................................*.......... - uzp1 v13.8H, v18.8H, v19.8H // ...................e............................ - uzp1 v3.8H, v29.8H, v12.8H // ..e............................................. - uzp2 v10.8H, v29.8H, v12.8H // ...e............................................ - smull v12.4S, v3.4H, v4.4H // .............e.................................. - smull2 v11.4S, v3.8H, v4.8H // ..............e................................. - ldr q5, [x5, #-16] // ......................e......................... - smlal v12.4S, v10.4H, v17.4H // ...............e................................ - smlal2 v11.4S, v10.8H, v17.8H // ................e............................... - uzp2 v14.8H, v30.8H, v5.8H // ........................e....................... - uzp1 v15.8H, v30.8H, v5.8H // .......................e........................ - smlal v12.4S, v13.4H, v14.4H // ..............................e................. - smlal2 v11.4S, v13.8H, v14.8H // ...............................e................ - uzp2 v28.8H, v18.8H, v19.8H // ....................e........................... 
- smlal v23.4S, v31.4H, v0.4H // ....................................*........... - smlal v12.4S, v28.4H, v15.4H // ................................e............... - smlal2 v11.4S, v28.8H, v15.8H // .................................e.............. - ld1 {v22.8H}, [x6], #16 // .........................e...................... - uzp2 v19.8H, v23.8H, v20.8H // ......................................*......... - uzp1 v1.8H, v12.8H, v11.8H // .......................................e........ - smull v23.4S, v3.4H, v17.4H // .........e...................................... - mul v14.8H, v1.8H, v2.8H // ........................................e....... - zip2 v9.8H, v19.8H, v27.8H // .............................................*.. - ld1 {v4.8H}, [x3], #16 // ........e....................................... - smlal2 v11.4S, v14.8H, v0.8H // ..........................................e..... - - // ------------------------------------------------- new position --------------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------------- - // ldr q12, [x1], #32 // ....................e..........................'....................~..........................'.................. - // ldr q13, [x1, #-16] // .....................e.........................'.....................~.........................'.................. - // uzp1 v3.8h, v12.8h, v13.8h // ........................e......................'........................~......................'.................. - // uzp2 v4.8h, v12.8h, v13.8h // .........................e.....................'.........................~.....................'.................. - // ldr q12, [x2], #32 // .......e.......................................'.......~.......................................'.......~.......... 
- // ldr q13, [x2, #-16] // ...........e...................................'...........~...................................'...........~...... - // uzp1 v5.8h, v12.8h, v13.8h // ..............e................................'..............~................................'..............~... - // uzp2 v6.8h, v12.8h, v13.8h // ...............e...............................'...............~...............................'...............~.. - // ld1 {v7.8h}, [x3], #16 // .............................................e.'.............................................~.'.................. - // smull v8.4s, v3.4h, v5.4h // ..........................................e....'..........................................~....'.................. - // smull2 v10.4s, v3.8h, v5.8h // ...............................................*...............................................~.................. - // smlal v8.4s, v4.4h, v7.4h // ....~..........................................'....*..........................................'....~............. - // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................'..*............................................'..~............... - // smull v9.4s, v3.4h, v6.4h // ..........................e....................'..........................~....................'.................. - // smull2 v11.4s, v3.8h, v6.8h // ...........................e...................'...........................~...................'.................. - // smlal v9.4s, v4.4h, v5.4h // .............................e.................'.............................~.................'.................. - // smlal2 v11.4s, v4.8h, v5.8h // ..............................e................'..............................~................'.................. - // ldr q12, [x4], #32 // e..............................................'~..............................................'~................. 
- // ldr q13, [x4, #-16] // ...................e...........................'...................~...........................'.................. - // uzp1 v3.8h, v12.8h, v13.8h // .......................e.......................'.......................~.......................'.................. - // uzp2 v4.8h, v12.8h, v13.8h // ...................................e...........'...................................~...........'.................. - // ldr q12, [x5], #32 // .e.............................................'.~.............................................'.~................ - // ldr q13, [x5, #-16] // ............................e..................'............................~..................'.................. - // uzp1 v5.8h, v12.8h, v13.8h // ................................e..............'................................~..............'.................. - // uzp2 v6.8h, v12.8h, v13.8h // ...............................e...............'...............................~...............'.................. - // ld1 {v7.8h}, [x6], #16 // .......................................e.......'.......................................~.......'.................. - // smlal v8.4s, v3.4h, v5.4h // ........~......................................'........*......................................'........~......... - // smlal2 v10.4s, v3.8h, v5.8h // ......~........................................'......*........................................'......~........... - // smlal v8.4s, v4.4h, v7.4h // ............~..................................'............*..................................'............~..... - // smlal2 v10.4s, v4.8h, v7.8h // .........~.....................................'.........*.....................................'.........~........ - // smlal v9.4s, v3.4h, v6.4h // .................................e.............'.................................~.............'.................. 
- // smlal2 v11.4s, v3.8h, v6.8h // ..................................e............'..................................~............'.................. - // smlal v9.4s, v4.4h, v5.4h // .....................................e.........'.....................................~.........'.................. - // smlal2 v11.4s, v4.8h, v5.8h // ......................................e........'......................................~........'.................. - // uzp1 v28.8h, v8.8h, v10.8h // ................~..............................'................*..............................'................~. - // mul v28.8h, v28.8h, v2.8h // ..................~............................'..................*............................'.................. - // smlal v8.4s, v28.4h, v0.4h // ....................................~..........'....................................*..........'.................. - // smlal2 v10.4s, v28.8h, v0.8h // ......................~........................'......................*........................'.................. - // uzp2 v26.8h, v8.8h, v10.8h // ........................................~......'........................................*......'.................. - // uzp1 v28.8h, v9.8h, v11.8h // .........................................e.....'.........................................~.....'.................. - // mul v28.8h, v28.8h, v2.8h // ...........................................e...'...........................................~...'.................. - // smlal v9.4s, v28.4h, v0.4h // ...~...........................................'...*...........................................'...~.............. - // smlal2 v11.4s, v28.8h, v0.8h // ..............................................e'..............................................~'.................. - // uzp2 v27.8h, v9.8h, v11.8h // .............~.................................'.............*.................................'.............~.... 
- // zip1 v12.8h, v26.8h, v27.8h // ..........~....................................'..........~....................................'..........l....... - // zip2 v13.8h, v26.8h, v27.8h // ............................................~..'............................................*..'.................. - // str q12, [x0], #32 // .................~.............................'.................~.............................'.................l - // str q13, [x0, #-16] // .....~.........................................'.....~.........................................'.....l............ - - sub count, count, #1 - cbnz count, 1b - // Instructions: 21 - // Expected cycles: 35 - // Expected IPC: 0.60 - - // Cycle bound: 35.0 - // IPC bound: 0.60 - - // Wall time: 0.08s - // User time: 0.08s - - // ----- original position -----> - // 0 25 - // |------------------------|---- - smull2 v5.4S, v3.8H, v17.8H // *............................. - smlal v12.4S, v14.4H, v0.4H // ..*........................... - smlal v23.4S, v10.4H, v4.4H // ...*.......................... - str q9, [x0, #16] // ....*......................... - smlal2 v5.4S, v10.8H, v4.8H // .*............................ - uzp2 v11.8H, v12.8H, v11.8H // ..........*................... - zip1 v9.8H, v19.8H, v27.8H // ........*..................... - smlal v23.4S, v13.4H, v15.4H // ......*....................... - smlal2 v5.4S, v13.8H, v15.8H // .....*........................ - str q9, [x0], #32 // ............*................. - smlal v23.4S, v28.4H, v22.4H // .........*.................... - smlal2 v5.4S, v28.8H, v22.8H // .......*...................... - uzp1 v9.8H, v23.8H, v5.8H // ...........*.................. - mul v9.8H, v9.8H, v2.8H // .............*................ - smlal2 v5.4S, v9.8H, v0.8H // ..............*............... - smlal v23.4S, v9.4H, v0.4H // ...............*.............. - uzp2 v9.8H, v23.8H, v5.8H // ................*............. - zip2 v5.8H, v9.8H, v11.8H // .................*............ 
- zip1 v9.8H, v9.8H, v11.8H // ...................*.......... - str q5, [x0, #16] // ..................*........... - str q9, [x0], #32 // ....................*......... - - // -------- new position --------> - // 0 25 - // |------------------------|----- - // smull2 v20.4S, v3.8H, v17.8H // *.............................. - // smlal2 v20.4S, v10.8H, v4.8H // ....*.......................... - // smlal v12.4S, v14.4H, v0.4H // .*............................. - // smlal v23.4S, v10.4H, v4.4H // ..*............................ - // str q9, [x0, #16] // ...*........................... - // smlal2 v20.4S, v13.8H, v15.8H // ........*...................... - // smlal v23.4S, v13.4H, v15.4H // .......*....................... - // smlal2 v20.4S, v28.8H, v22.8H // ...........*................... - // zip1 v26.8H, v19.8H, v27.8H // ......*........................ - // smlal v23.4S, v28.4H, v22.4H // ..........*.................... - // uzp2 v27.8H, v12.8H, v11.8H // .....*......................... - // uzp1 v5.8H, v23.8H, v20.8H // ............*.................. - // str q26, [x0], #32 // .........*..................... - // mul v31.8H, v5.8H, v2.8H // .............*................. - // smlal2 v20.4S, v31.8H, v0.8H // ..............*................ - // smlal v23.4S, v31.4H, v0.4H // ...............*............... - // uzp2 v19.8H, v23.8H, v20.8H // ................*.............. - // zip2 v9.8H, v19.8H, v27.8H // .................*............. - // str q9, [x0, #16] // ...................*........... - // zip1 v26.8H, v19.8H, v27.8H // ..................*............ - // str q26, [x0], #32 // ....................*.......... 
- - - pop_stack - ret -#endif /* MLKEM_K == 2 */ - -#if MLKEM_K == 3 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - - mov count, #(MLKEM_N / 16) - // Instructions: 75 - // Expected cycles: 103 - // Expected IPC: 0.73 - - // Cycle bound: 103.0 - // IPC bound: 0.73 - - // Wall time: 0.94s - // User time: 0.94s - - // --------------------------- original position ----------------------------> - // 0 25 50 - // |------------------------|------------------------| - ldr q7, [x2, #16] // *.......................................................................... - ldr q20, [x2], #32 // ..*........................................................................ - ldr q15, [x1, #16] // .*......................................................................... - uzp1 v8.8H, v20.8H, v7.8H // ...............*........................................................... - uzp2 v7.8H, v20.8H, v7.8H // ................*.......................................................... - ld1 {v20.8H}, [x3], #16 // ...*....................................................................... - ldr q30, [x1], #32 // ..............*............................................................ - ldr q11, [x4], #32 // ....*...................................................................... - uzp1 v16.8H, v30.8H, v15.8H // .................*......................................................... - uzp2 v15.8H, v30.8H, v15.8H // ..................*........................................................ 
- smull v30.4S, v16.4H, v7.4H // ...................*....................................................... - smull2 v7.4S, v16.8H, v7.8H // ....................*...................................................... - smull v9.4S, v16.4H, v8.4H // .....................*..................................................... - smull2 v16.4S, v16.8H, v8.8H // ......................*.................................................... - smlal v30.4S, v15.4H, v8.4H // .......................*................................................... - smlal2 v7.4S, v15.8H, v8.8H // ........................*.................................................. - smlal v9.4S, v15.4H, v20.4H // .........................*................................................. - smlal2 v16.4S, v15.8H, v20.8H // ..........................*................................................ - ldr q20, [x4, #-16] // .....*..................................................................... - ldr q15, [x5], #32 // ......*.................................................................... - uzp1 v8.8H, v11.8H, v20.8H // ...........................*............................................... - uzp2 v20.8H, v11.8H, v20.8H // ............................*.............................................. - ldr q11, [x5, #-16] // .......*................................................................... - ld1 {v27.8H}, [x6], #16 // ........*.................................................................. - uzp1 v10.8H, v15.8H, v11.8H // .............................*............................................. - uzp2 v15.8H, v15.8H, v11.8H // ..............................*............................................ - smlal v9.4S, v8.4H, v10.4H // ...............................*........................................... - smlal2 v16.4S, v8.8H, v10.8H // ................................*.......................................... 
- smlal v30.4S, v8.4H, v15.4H // .................................*......................................... - smlal2 v7.4S, v8.8H, v15.8H // ..................................*........................................ - smlal v9.4S, v20.4H, v27.4H // ...................................*....................................... - smlal2 v16.4S, v20.8H, v27.8H // ....................................*...................................... - smlal v30.4S, v20.4H, v10.4H // .....................................*..................................... - smlal2 v7.4S, v20.8H, v10.8H // ......................................*.................................... - ldr q20, [x7], #32 // .........*................................................................. - ldr q15, [x7, #-16] // ..........*................................................................ - ldr q8, [x8], #32 // ...........*............................................................... - uzp1 v11.8H, v20.8H, v15.8H // .......................................*................................... - uzp2 v20.8H, v20.8H, v15.8H // ........................................*.................................. - ldr q15, [x8, #-16] // ............*.............................................................. - ld1 {v27.8H}, [x9], #16 // .............*............................................................. - uzp1 v10.8H, v8.8H, v15.8H // .........................................*................................. - uzp2 v15.8H, v8.8H, v15.8H // ..........................................*................................ - smlal v9.4S, v11.4H, v10.4H // ...........................................*............................... - smlal2 v16.4S, v11.8H, v10.8H // ............................................*.............................. - smlal v30.4S, v11.4H, v15.4H // .............................................*............................. 
- smlal2 v7.4S, v11.8H, v15.8H // ..............................................*............................ - smlal v9.4S, v20.4H, v27.4H // ...............................................*........................... - smlal2 v16.4S, v20.8H, v27.8H // ................................................*.......................... - smlal v30.4S, v20.4H, v10.4H // .................................................*......................... - smlal2 v7.4S, v20.8H, v10.8H // ..................................................*........................ - ldr q15, [x2], #32 // ...............................................................*........... - uzp1 v20.8H, v9.8H, v16.8H // ....................................................*...................... - uzp1 v8.8H, v30.8H, v7.8H // .....................................................*..................... - mul v20.8H, v20.8H, v2.8H // ......................................................*.................... - mul v8.8H, v8.8H, v2.8H // .......................................................*................... - ldr q21, [x4], #32 // .................................................................*......... - smlal v9.4S, v20.4H, v0.4H // ........................................................*.................. - smlal2 v16.4S, v20.8H, v0.8H // .........................................................*................. - smlal v30.4S, v8.4H, v0.4H // ..........................................................*................ - smlal2 v7.4S, v8.8H, v0.8H // ...........................................................*............... - ldr q6, [x4, #-16] // ..................................................................*........ - uzp2 v27.8H, v9.8H, v16.8H // ............................................................*.............. - uzp2 v10.8H, v30.8H, v7.8H // .............................................................*............. 
- ldr q16, [x2, #-16] // ...................................................*....................... - ldr q30, [x1, #16] // ..............................................................*............ - ld1 {v9.8H}, [x3], #16 // ................................................................*.......... - ldr q1, [x5], #32 // ...................................................................*....... - ldr q12, [x5, #-16] // ....................................................................*...... - ld1 {v24.8H}, [x6], #16 // .....................................................................*..... - ldr q19, [x7], #32 // ......................................................................*.... - ldr q31, [x7, #-16] // .......................................................................*... - ldr q17, [x8], #32 // ........................................................................*.. - ldr q18, [x8, #-16] // .........................................................................*. - ld1 {v25.8H}, [x9], #16 // ..........................................................................* - - // ------------------------------ new position ------------------------------> - // 0 25 50 - // |------------------------|------------------------|------------------------ - // ldr q16, [x2, #16] // *.......................................................................... - // ldr q30, [x1, #16] // ..*........................................................................ - // ldr q15, [x2], #32 // .*......................................................................... - // ld1 {v9.8H}, [x3], #16 // .....*..................................................................... - // ldr q21, [x4], #32 // .......*................................................................... - // ldr q6, [x4, #-16] // ..................*........................................................ 
- // ldr q1, [x5], #32 // ...................*....................................................... - // ldr q12, [x5, #-16] // ......................*.................................................... - // ld1 {v24.8H}, [x6], #16 // .......................*................................................... - // ldr q19, [x7], #32 // ..................................*........................................ - // ldr q31, [x7, #-16] // ...................................*....................................... - // ldr q17, [x8], #32 // ....................................*...................................... - // ldr q18, [x8, #-16] // .......................................*................................... - // ld1 {v25.8H}, [x9], #16 // ........................................*.................................. - // ldr q20, [x1], #32 // ......*.................................................................... - // uzp1 v7.8H, v15.8H, v16.8H // ...*....................................................................... - // uzp2 v15.8H, v15.8H, v16.8H // ....*...................................................................... - // uzp1 v8.8H, v20.8H, v30.8H // ........*.................................................................. - // uzp2 v20.8H, v20.8H, v30.8H // .........*................................................................. - // smull v30.4S, v8.4H, v15.4H // ..........*................................................................ - // smull2 v15.4S, v8.8H, v15.8H // ...........*............................................................... - // smull v11.4S, v8.4H, v7.4H // ............*.............................................................. - // smull2 v8.4S, v8.8H, v7.8H // .............*............................................................. - // smlal v30.4S, v20.4H, v7.4H // ..............*............................................................ 
- // smlal2 v15.4S, v20.8H, v7.8H // ...............*........................................................... - // smlal v11.4S, v20.4H, v9.4H // ................*.......................................................... - // smlal2 v8.4S, v20.8H, v9.8H // .................*......................................................... - // uzp1 v7.8H, v21.8H, v6.8H // ....................*...................................................... - // uzp2 v20.8H, v21.8H, v6.8H // .....................*..................................................... - // uzp1 v16.8H, v1.8H, v12.8H // ........................*.................................................. - // uzp2 v9.8H, v1.8H, v12.8H // .........................*................................................. - // smlal v11.4S, v7.4H, v16.4H // ..........................*................................................ - // smlal2 v8.4S, v7.8H, v16.8H // ...........................*............................................... - // smlal v30.4S, v7.4H, v9.4H // ............................*.............................................. - // smlal2 v15.4S, v7.8H, v9.8H // .............................*............................................. - // smlal v11.4S, v20.4H, v24.4H // ..............................*............................................ - // smlal2 v8.4S, v20.8H, v24.8H // ...............................*........................................... - // smlal v30.4S, v20.4H, v16.4H // ................................*.......................................... - // smlal2 v15.4S, v20.8H, v16.8H // .................................*......................................... - // uzp1 v7.8H, v19.8H, v31.8H // .....................................*..................................... - // uzp2 v20.8H, v19.8H, v31.8H // ......................................*.................................... 
- // uzp1 v16.8H, v17.8H, v18.8H // .........................................*................................. - // uzp2 v9.8H, v17.8H, v18.8H // ..........................................*................................ - // smlal v11.4S, v7.4H, v16.4H // ...........................................*............................... - // smlal2 v8.4S, v7.8H, v16.8H // ............................................*.............................. - // smlal v30.4S, v7.4H, v9.4H // .............................................*............................. - // smlal2 v15.4S, v7.8H, v9.8H // ..............................................*............................ - // smlal v11.4S, v20.4H, v25.4H // ...............................................*........................... - // smlal2 v8.4S, v20.8H, v25.8H // ................................................*.......................... - // smlal v30.4S, v20.4H, v16.4H // .................................................*......................... - // smlal2 v15.4S, v20.8H, v16.8H // ..................................................*........................ - // ldr q16, [x2, #16] // ................................................................*.......... - // uzp1 v7.8H, v11.8H, v8.8H // ....................................................*...................... - // uzp1 v20.8H, v30.8H, v15.8H // .....................................................*..................... - // mul v7.8H, v7.8H, v2.8H // ......................................................*.................... - // mul v20.8H, v20.8H, v2.8H // .......................................................*................... - // smlal v11.4S, v7.4H, v0.4H // .........................................................*................. - // smlal2 v8.4S, v7.8H, v0.8H // ..........................................................*................ - // smlal v30.4S, v20.4H, v0.4H // ...........................................................*............... 
- // smlal2 v15.4S, v20.8H, v0.8H // ............................................................*.............. - // uzp2 v27.8H, v11.8H, v8.8H // ..............................................................*............ - // uzp2 v10.8H, v30.8H, v15.8H // ...............................................................*........... - // ldr q30, [x1, #16] // .................................................................*......... - // ldr q15, [x2], #32 // ...................................................*....................... - // ld1 {v9.8H}, [x3], #16 // ..................................................................*........ - // ldr q21, [x4], #32 // ........................................................*.................. - // ldr q6, [x4, #-16] // .............................................................*............. - // ldr q1, [x5], #32 // ...................................................................*....... - // ldr q12, [x5, #-16] // ....................................................................*...... - // ld1 {v24.8H}, [x6], #16 // .....................................................................*..... - // ldr q19, [x7], #32 // ......................................................................*.... - // ldr q31, [x7, #-16] // .......................................................................*... - // ldr q17, [x8], #32 // ........................................................................*.. - // ldr q18, [x8, #-16] // .........................................................................*. 
- // ld1 {v25.8H}, [x9], #16 // ..........................................................................* - - sub count, count, #2 -1: - // Instructions: 65 - // Expected cycles: 80 - // Expected IPC: 0.81 - - // Cycle bound: 80.0 - // IPC bound: 0.81 - - // Wall time: 11.64s - // User time: 11.64s - - // ---------------------- original position -----------------------> - // 0 25 50 - // |------------------------|------------------------|-------------- - ldr q20, [x1], #32 // *................................................................ - uzp1 v7.8H, v15.8H, v16.8H // ......*.......................................................... - uzp2 v15.8H, v15.8H, v16.8H // .......*......................................................... - uzp1 v8.8H, v20.8H, v30.8H // ..*.............................................................. - uzp2 v20.8H, v20.8H, v30.8H // ...*............................................................. - smull v30.4S, v8.4H, v15.4H // .............*................................................... - smull2 v15.4S, v8.8H, v15.8H // ..............*.................................................. - smull v11.4S, v8.4H, v7.4H // .........*....................................................... - smull2 v8.4S, v8.8H, v7.8H // ..........*...................................................... - smlal v30.4S, v20.4H, v7.4H // ...............*................................................. - smlal2 v15.4S, v20.8H, v7.8H // ................*................................................ - smlal v11.4S, v20.4H, v9.4H // ...........*..................................................... - smlal2 v8.4S, v20.8H, v9.8H // ............*.................................................... - uzp1 v7.8H, v21.8H, v6.8H // ...................*............................................. - uzp2 v20.8H, v21.8H, v6.8H // ....................*............................................ 
- uzp1 v16.8H, v1.8H, v12.8H // .......................*......................................... - uzp2 v9.8H, v1.8H, v12.8H // ........................*........................................ - smlal v11.4S, v7.4H, v16.4H // ..........................*...................................... - smlal2 v8.4S, v7.8H, v16.8H // ...........................*..................................... - smlal v30.4S, v7.4H, v9.4H // ..............................*.................................. - smlal2 v15.4S, v7.8H, v9.8H // ...............................*................................. - smlal v11.4S, v20.4H, v24.4H // ............................*.................................... - smlal2 v8.4S, v20.8H, v24.8H // .............................*................................... - smlal v30.4S, v20.4H, v16.4H // ................................*................................ - smlal2 v15.4S, v20.8H, v16.8H // .................................*............................... - uzp1 v7.8H, v19.8H, v31.8H // ....................................*............................ - uzp2 v20.8H, v19.8H, v31.8H // .....................................*........................... - uzp1 v16.8H, v17.8H, v18.8H // ........................................*........................ - uzp2 v9.8H, v17.8H, v18.8H // .........................................*....................... - smlal v11.4S, v7.4H, v16.4H // ...........................................*..................... - smlal2 v8.4S, v7.8H, v16.8H // ............................................*.................... - smlal v30.4S, v7.4H, v9.4H // ...............................................*................. - smlal2 v15.4S, v7.8H, v9.8H // ................................................*................ - smlal v11.4S, v20.4H, v25.4H // .............................................*................... - smlal2 v8.4S, v20.8H, v25.8H // ..............................................*.................. 
- smlal v30.4S, v20.4H, v16.4H // .................................................*............... - smlal2 v15.4S, v20.8H, v16.8H // ..................................................*.............. - ldr q16, [x2, #16] // .....e........................................................... - uzp1 v7.8H, v11.8H, v8.8H // ...................................................*............. - uzp1 v20.8H, v30.8H, v15.8H // ........................................................*........ - mul v7.8H, v7.8H, v2.8H // ....................................................*............ - mul v20.8H, v20.8H, v2.8H // .........................................................*....... - zip2 v9.8H, v27.8H, v10.8H // ..............................................................l.. - zip1 v27.8H, v27.8H, v10.8H // .............................................................l... - smlal v11.4S, v7.4H, v0.4H // .....................................................*........... - smlal2 v8.4S, v7.8H, v0.8H // ......................................................*.......... - smlal v30.4S, v20.4H, v0.4H // ..........................................................*...... - smlal2 v15.4S, v20.8H, v0.8H // ...........................................................*..... - str q27, [x0], #32 // ...............................................................l. - uzp2 v27.8H, v11.8H, v8.8H // .......................................................*......... - str q9, [x0, #-16] // ................................................................l - uzp2 v10.8H, v30.8H, v15.8H // ............................................................*.... - ldr q30, [x1, #16] // .e............................................................... - ldr q15, [x2], #32 // ....e............................................................ - ld1 {v9.8H}, [x3], #16 // ........e........................................................ 
- ldr q21, [x4], #32 // .................e............................................... - ldr q6, [x4, #-16] // ..................e.............................................. - ldr q1, [x5], #32 // .....................e........................................... - ldr q12, [x5, #-16] // ......................e.......................................... - ld1 {v24.8H}, [x6], #16 // .........................e....................................... - ldr q19, [x7], #32 // ..................................e.............................. - ldr q31, [x7, #-16] // ...................................e............................. - ldr q17, [x8], #32 // ......................................e.......................... - ldr q18, [x8, #-16] // .......................................e......................... - ld1 {v25.8H}, [x9], #16 // ..........................................e...................... - - // ---------------------------------------------------------------- new position -----------------------------------------------------------------> - // 0 25 50 75 100 125 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------ - // ldr q12, [x1], #32 // ............................*................................................................~.................................................. - // ldr q13, [x1, #-16] // ...............e............'...................................................~............'.................................................. - // uzp1 v3.8h, v12.8h, v13.8h // ............................'..*.............................................................'..~............................................... - // uzp2 v4.8h, v12.8h, v13.8h // ............................'...*............................................................'...~.............................................. 
- // ldr q12, [x2], #32 // ................e...........'....................................................~...........'.................................................. - // ldr q13, [x2, #-16] // e...........................'....................................~...........................'....................................~............. - // uzp1 v5.8h, v12.8h, v13.8h // ............................'*...............................................................'~................................................. - // uzp2 v6.8h, v12.8h, v13.8h // ............................'.*..............................................................'.~................................................ - // ld1 {v7.8h}, [x3], #16 // .................e..........'.....................................................~..........'.................................................. - // smull v8.4s, v3.4h, v5.4h // ............................'......*.........................................................'......~........................................... - // smull2 v10.4s, v3.8h, v5.8h // ............................'.......*........................................................'.......~.......................................... - // smlal v8.4s, v4.4h, v7.4h // ............................'..........*.....................................................'..........~....................................... - // smlal2 v10.4s, v4.8h, v7.8h // ............................'...........*....................................................'...........~...................................... - // smull v9.4s, v3.4h, v6.4h // ............................'....*...........................................................'....~............................................. - // smull2 v11.4s, v3.8h, v6.8h // ............................'.....*..........................................................'.....~............................................ 
- // smlal v9.4s, v4.4h, v5.4h // ............................'........*.......................................................'........~......................................... - // smlal2 v11.4s, v4.8h, v5.8h // ............................'.........*......................................................'.........~........................................ - // ldr q12, [x4], #32 // ..................e.........'......................................................~.........'.................................................. - // ldr q13, [x4, #-16] // ...................e........'.......................................................~........'.................................................. - // uzp1 v3.8h, v12.8h, v13.8h // ............................'............*...................................................'............~..................................... - // uzp2 v4.8h, v12.8h, v13.8h // ............................'.............*..................................................'.............~.................................... - // ldr q12, [x5], #32 // ....................e.......'........................................................~.......'.................................................. - // ldr q13, [x5, #-16] // .....................e......'.........................................................~......'.................................................. - // uzp1 v5.8h, v12.8h, v13.8h // ............................'..............*.................................................'..............~................................... - // uzp2 v6.8h, v12.8h, v13.8h // ............................'...............*................................................'...............~.................................. - // ld1 {v7.8h}, [x6], #16 // ......................e.....'..........................................................~.....'.................................................. 
- // smlal v8.4s, v3.4h, v5.4h // ............................'................*...............................................'................~................................. - // smlal2 v10.4s, v3.8h, v5.8h // ............................'.................*..............................................'.................~................................ - // smlal v8.4s, v4.4h, v7.4h // ............................'....................*...........................................'....................~............................. - // smlal2 v10.4s, v4.8h, v7.8h // ............................'.....................*..........................................'.....................~............................ - // smlal v9.4s, v3.4h, v6.4h // ............................'..................*.............................................'..................~............................... - // smlal2 v11.4s, v3.8h, v6.8h // ............................'...................*............................................'...................~.............................. - // smlal v9.4s, v4.4h, v5.4h // ............................'......................*.........................................'......................~........................... - // smlal2 v11.4s, v4.8h, v5.8h // ............................'.......................*........................................'.......................~.......................... - // ldr q12, [x7], #32 // .......................e....'...........................................................~....'.................................................. - // ldr q13, [x7, #-16] // ........................e...'............................................................~...'.................................................. - // uzp1 v3.8h, v12.8h, v13.8h // ............................'........................*.......................................'........................~......................... 
- // uzp2 v4.8h, v12.8h, v13.8h // ............................'.........................*......................................'.........................~........................ - // ldr q12, [x8], #32 // .........................e..'.............................................................~..'.................................................. - // ldr q13, [x8, #-16] // ..........................e.'..............................................................~.'.................................................. - // uzp1 v5.8h, v12.8h, v13.8h // ............................'..........................*.....................................'..........................~....................... - // uzp2 v6.8h, v12.8h, v13.8h // ............................'...........................*....................................'...........................~...................... - // ld1 {v7.8h}, [x9], #16 // ...........................e'...............................................................~'.................................................. - // smlal v8.4s, v3.4h, v5.4h // ............................'............................*...................................'............................~..................... - // smlal2 v10.4s, v3.8h, v5.8h // ............................'.............................*..................................'.............................~.................... - // smlal v8.4s, v4.4h, v7.4h // ............................'................................*...............................'................................~................. - // smlal2 v10.4s, v4.8h, v7.8h // ............................'.................................*..............................'.................................~................ - // smlal v9.4s, v3.4h, v6.4h // ............................'..............................*.................................'..............................~................... 
- // smlal2 v11.4s, v3.8h, v6.8h // ............................'...............................*................................'...............................~.................. - // smlal v9.4s, v4.4h, v5.4h // ............................'..................................*.............................'..................................~............... - // smlal2 v11.4s, v4.8h, v5.8h // ............................'...................................*............................'...................................~.............. - // uzp1 v28.8h, v8.8h, v10.8h // .~..........................'.....................................*..........................'.....................................~............ - // mul v28.8h, v28.8h, v2.8h // ...~........................'.......................................*........................'.......................................~.......... - // smlal v8.4s, v28.4h, v0.4h // .......~....................'...........................................*....................'...........................................~...... - // smlal2 v10.4s, v28.8h, v0.8h // ........~...................'............................................*...................'............................................~..... - // uzp2 v26.8h, v8.8h, v10.8h // ............~...............'................................................*...............'................................................~. - // uzp1 v28.8h, v9.8h, v11.8h // ..~.........................'......................................*.........................'......................................~........... - // mul v28.8h, v28.8h, v2.8h // ....~.......................'........................................*.......................'........................................~......... - // smlal v9.4s, v28.4h, v0.4h // .........~..................'.............................................*..................'.............................................~.... 
- // smlal2 v11.4s, v28.8h, v0.8h // ..........~.................'..............................................*.................'..............................................~... - // uzp2 v27.8h, v9.8h, v11.8h // ..............~.............'..................................................*.............'.................................................. - // zip1 v12.8h, v26.8h, v27.8h // ......~.....................'..........................................~.....................'..........................................l....... - // zip2 v13.8h, v26.8h, v27.8h // .....~......................'.........................................~......................'.........................................l........ - // str q12, [x0], #32 // ...........~................'...............................................~................'...............................................l.. - // str q13, [x0, #-16] // .............~..............'.................................................~..............'.................................................l - - sub count, count, #1 - cbnz count, 1b - // Instructions: 55 - // Expected cycles: 61 - // Expected IPC: 0.90 - - // Cycle bound: 61.0 - // IPC bound: 0.90 - - // Wall time: 8.41s - // User time: 8.41s - - // ----------------- original position ------------------> - // 0 25 50 - // |------------------------|------------------------|---- - ldr q7, [x1], #32 // *...................................................... - uzp1 v20.8H, v15.8H, v16.8H // .*..................................................... - uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... - uzp1 v23.8H, v7.8H, v30.8H // ...*................................................... - uzp2 v11.8H, v7.8H, v30.8H // ....*.................................................. - smull2 v8.4S, v23.8H, v20.8H // ........*.............................................. 
- smull v5.4S, v23.4H, v20.4H // .......*............................................... - smull2 v30.4S, v23.8H, v15.8H // ......*................................................ - uzp1 v28.8H, v1.8H, v12.8H // ...............*....................................... - smlal2 v8.4S, v11.8H, v9.8H // ............*.......................................... - smlal v5.4S, v11.4H, v9.4H // ...........*........................................... - uzp1 v3.8H, v21.8H, v6.8H // .............*......................................... - smull v16.4S, v23.4H, v15.4H // .....*................................................. - smlal2 v8.4S, v3.8H, v28.8H // ..................*.................................... - smlal v5.4S, v3.4H, v28.4H // .................*..................................... - uzp2 v29.8H, v21.8H, v6.8H // ..............*........................................ - uzp1 v7.8H, v17.8H, v18.8H // ...........................*........................... - smlal2 v8.4S, v29.8H, v24.8H // ......................*................................ - uzp1 v14.8H, v19.8H, v31.8H // .........................*............................. - smlal v16.4S, v11.4H, v20.4H // .........*............................................. - smlal2 v30.4S, v11.8H, v20.8H // ..........*............................................ - smlal2 v8.4S, v14.8H, v7.8H // ..............................*........................ - uzp2 v20.8H, v1.8H, v12.8H // ................*...................................... - uzp2 v21.8H, v19.8H, v31.8H // ..........................*............................ - smlal2 v30.4S, v3.8H, v20.8H // ....................*.................................. - smlal v16.4S, v3.4H, v20.4H // ...................*................................... - smlal v5.4S, v29.4H, v24.4H // .....................*................................. - uzp2 v9.8H, v17.8H, v18.8H // ............................*.......................... 
- smlal2 v30.4S, v29.8H, v28.8H // ........................*.............................. - smlal v16.4S, v29.4H, v28.4H // .......................*............................... - smlal v5.4S, v14.4H, v7.4H // .............................*......................... - smlal2 v8.4S, v21.8H, v25.8H // ..................................*.................... - smlal2 v30.4S, v14.8H, v9.8H // ................................*...................... - smlal v16.4S, v14.4H, v9.4H // ...............................*....................... - smlal v5.4S, v21.4H, v25.4H // .................................*..................... - zip1 v20.8H, v27.8H, v10.8H // ..........................................*............ - smlal2 v30.4S, v21.8H, v7.8H // ....................................*.................. - smlal v16.4S, v21.4H, v7.4H // ...................................*................... - uzp1 v7.8H, v5.8H, v8.8H // .....................................*................. - str q20, [x0], #32 // ...............................................*....... - mul v15.8H, v7.8H, v2.8H // .......................................*............... - uzp1 v7.8H, v16.8H, v30.8H // ......................................*................ - zip2 v31.8H, v27.8H, v10.8H // .........................................*............. - mul v20.8H, v7.8H, v2.8H // ........................................*.............. - smlal v5.4S, v15.4H, v0.4H // ...........................................*........... - smlal2 v8.4S, v15.8H, v0.8H // ............................................*.......... - str q31, [x0, #-16] // .................................................*..... - smlal2 v30.4S, v20.8H, v0.8H // ..............................................*........ - smlal v16.4S, v20.4H, v0.4H // .............................................*......... - uzp2 v15.8H, v5.8H, v8.8H // ................................................*...... 
- uzp2 v20.8H, v16.8H, v30.8H // ..................................................*.... - zip1 v7.8H, v15.8H, v20.8H // ....................................................*.. - zip2 v20.8H, v15.8H, v20.8H // ...................................................*... - str q7, [x0], #32 // .....................................................*. - str q20, [x0, #-16] // ......................................................* - - // -------------------- new position --------------------> - // 0 25 50 - // |------------------------|------------------------|---- - // ldr q20, [x1], #32 // *...................................................... - // uzp1 v7.8H, v15.8H, v16.8H // .*..................................................... - // uzp2 v15.8H, v15.8H, v16.8H // ..*.................................................... - // uzp1 v8.8H, v20.8H, v30.8H // ...*................................................... - // uzp2 v20.8H, v20.8H, v30.8H // ....*.................................................. - // smull v30.4S, v8.4H, v15.4H // ............*.......................................... - // smull2 v15.4S, v8.8H, v15.8H // .......*............................................... - // smull v11.4S, v8.4H, v7.4H // ......*................................................ - // smull2 v8.4S, v8.8H, v7.8H // .....*................................................. - // smlal v30.4S, v20.4H, v7.4H // ...................*................................... - // smlal2 v15.4S, v20.8H, v7.8H // ....................*.................................. - // smlal v11.4S, v20.4H, v9.4H // ..........*............................................ - // smlal2 v8.4S, v20.8H, v9.8H // .........*............................................. - // uzp1 v7.8H, v21.8H, v6.8H // ...........*........................................... - // uzp2 v20.8H, v21.8H, v6.8H // ...............*....................................... 
- // uzp1 v16.8H, v1.8H, v12.8H // ........*.............................................. - // uzp2 v9.8H, v1.8H, v12.8H // ......................*................................ - // smlal v11.4S, v7.4H, v16.4H // ..............*........................................ - // smlal2 v8.4S, v7.8H, v16.8H // .............*......................................... - // smlal v30.4S, v7.4H, v9.4H // .........................*............................. - // smlal2 v15.4S, v7.8H, v9.8H // ........................*.............................. - // smlal v11.4S, v20.4H, v24.4H // ..........................*............................ - // smlal2 v8.4S, v20.8H, v24.8H // .................*..................................... - // smlal v30.4S, v20.4H, v16.4H // .............................*......................... - // smlal2 v15.4S, v20.8H, v16.8H // ............................*.......................... - // uzp1 v7.8H, v19.8H, v31.8H // ..................*.................................... - // uzp2 v20.8H, v19.8H, v31.8H // .......................*............................... - // uzp1 v16.8H, v17.8H, v18.8H // ................*...................................... - // uzp2 v9.8H, v17.8H, v18.8H // ...........................*........................... - // smlal v11.4S, v7.4H, v16.4H // ..............................*........................ - // smlal2 v8.4S, v7.8H, v16.8H // .....................*................................. - // smlal v30.4S, v7.4H, v9.4H // .................................*..................... - // smlal2 v15.4S, v7.8H, v9.8H // ................................*...................... - // smlal v11.4S, v20.4H, v25.4H // ..................................*.................... - // smlal2 v8.4S, v20.8H, v25.8H // ...............................*....................... - // smlal v30.4S, v20.4H, v16.4H // .....................................*................. 
- // smlal2 v15.4S, v20.8H, v16.8H // ....................................*.................. - // uzp1 v7.8H, v11.8H, v8.8H // ......................................*................ - // uzp1 v20.8H, v30.8H, v15.8H // .........................................*............. - // mul v7.8H, v7.8H, v2.8H // ........................................*.............. - // mul v20.8H, v20.8H, v2.8H // ...........................................*........... - // zip2 v9.8H, v27.8H, v10.8H // ..........................................*............ - // zip1 v27.8H, v27.8H, v10.8H // ...................................*................... - // smlal v11.4S, v7.4H, v0.4H // ............................................*.......... - // smlal2 v8.4S, v7.8H, v0.8H // .............................................*......... - // smlal v30.4S, v20.4H, v0.4H // ................................................*...... - // smlal2 v15.4S, v20.8H, v0.8H // ...............................................*....... - // str q27, [x0], #32 // .......................................*............... - // uzp2 v27.8H, v11.8H, v8.8H // .................................................*..... - // str q9, [x0, #-16] // ..............................................*........ - // uzp2 v10.8H, v30.8H, v15.8H // ..................................................*.... - // zip2 v9.8H, v27.8H, v10.8H // ....................................................*.. - // zip1 v27.8H, v27.8H, v10.8H // ...................................................*... - // str q27, [x0], #32 // .....................................................*. 
- // str q9, [x0, #-16] // ......................................................* - - - pop_stack - ret -#endif /* MLKEM_K == 3 */ - -#if MLKEM_K == 4 -.global MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt) - -.balign 4 -MLKEM_ASM_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached_asm_opt): - push_stack - mov wtmp, #3329 - dup modulus.8h, wtmp - - mov wtmp, #3327 - dup modulus_twisted.8h, wtmp - - // Computed bases of vector entries - - add a1_ptr, a0_ptr, #(1 * 512) - add b1_ptr, b0_ptr, #(1 * 512) - add b1_cache_ptr, b0_cache_ptr, #(1 * 512/2) - add a2_ptr, a0_ptr, #(2 * 512) - add b2_ptr, b0_ptr, #(2 * 512) - add b2_cache_ptr, b0_cache_ptr, #(2 * 512/2) - add a3_ptr, a0_ptr, #(3 * 512) - add b3_ptr, b0_ptr, #(3 * 512) - add b3_cache_ptr, b0_cache_ptr, #(3 * 512/2) - - // Bounds: - - // Each pmull is bound by 2*4096*2^15=2^28, so the final value - // before Montgomery reduction is bound by 2^30. - - mov count, #(MLKEM_N / 16) - // Instructions: 114 - // Expected cycles: 153 - // Expected IPC: 0.75 - // - // Cycle bound: 153.0 - // IPC bound: 0.75 - // - // Wall time: 0.69s - // User time: 0.69s - // - // ----------------------------------------------- original position -----------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------------- - ldr q23, [x2, #16] // .*................................................................................................................ - ldr q19, [x2], #32 // *................................................................................................................. - ldr q17, [x5], #32 // ..*............................................................................................................... - uzp2 v13.8H, v19.8H, v23.8H // ..........*....................................................................................................... 
- uzp1 v19.8H, v19.8H, v23.8H // ...........*...................................................................................................... - ldr q23, [x5, #-16] // ...*.............................................................................................................. - ldr q30, [x1, #16] // .....*............................................................................................................ - uzp2 v9.8H, v17.8H, v23.8H // ....*............................................................................................................. - uzp1 v23.8H, v17.8H, v23.8H // .......*.......................................................................................................... - ldr q17, [x1], #32 // ......*........................................................................................................... - ldr q10, [x7, #16] // .............*.................................................................................................... - uzp1 v12.8H, v17.8H, v30.8H // ........*......................................................................................................... - uzp2 v17.8H, v17.8H, v30.8H // .........*........................................................................................................ - smull2 v30.4S, v12.8H, v13.8H // ............*..................................................................................................... - smull v13.4S, v12.4H, v13.4H // ............................................*..................................................................... - smull2 v22.4S, v12.8H, v19.8H // .....................................*............................................................................ - smull v12.4S, v12.4H, v19.4H // ..........................................*....................................................................... 
- smlal2 v30.4S, v17.8H, v19.8H // ...............................*.................................................................................. - smlal v13.4S, v17.4H, v19.4H // ...............................................*.................................................................. - ldr q19, [x4], #32 // ....................*............................................................................................. - ldr q16, [x4, #-16] // .....................*............................................................................................ - ld1 {v8.8H}, [x3], #16 // ................................*................................................................................. - uzp1 v26.8H, v19.8H, v16.8H // .......................*.......................................................................................... - uzp2 v19.8H, v19.8H, v16.8H // ........................*......................................................................................... - smlal2 v30.4S, v26.8H, v9.8H // .................................*................................................................................ - smlal v13.4S, v26.4H, v9.4H // ..................................................*............................................................... - smlal2 v22.4S, v17.8H, v8.8H // ........................................*......................................................................... - smlal v12.4S, v17.4H, v8.4H // .................................................*................................................................ - smlal2 v30.4S, v19.8H, v23.8H // ...................................*.............................................................................. - smlal v13.4S, v19.4H, v23.4H // .......................................................*.......................................................... 
- smlal2 v22.4S, v26.8H, v23.8H // ...........................................*...................................................................... - smlal v12.4S, v26.4H, v23.4H // .....................................................*............................................................ - ldr q23, [x7], #32 // ......................*........................................................................................... - ldr q17, [x8, #16] // ..............*................................................................................................... - uzp1 v9.8H, v23.8H, v10.8H // ..........................*....................................................................................... - uzp2 v23.8H, v23.8H, v10.8H // ....................................*............................................................................. - ldr q10, [x10], #32 // ...............*.................................................................................................. - ldr q16, [x10, #-16] // ................*................................................................................................. - ld1 {v8.8H}, [x12], #16 // .................*................................................................................................ - uzp1 v26.8H, v10.8H, v16.8H // ..................*............................................................................................... - uzp2 v10.8H, v10.8H, v16.8H // ...................*.............................................................................................. - ld1 {v16.8H}, [x6], #16 // .........................*........................................................................................ - ldr q3, [x11, #16] // ...........................*...................................................................................... 
- smlal2 v22.4S, v19.8H, v16.8H // ..............................................*................................................................... - smlal v12.4S, v19.4H, v16.4H // ........................................................*......................................................... - ldr q19, [x11], #32 // ............................*..................................................................................... - ld1 {v16.8H}, [x9], #16 // .............................*.................................................................................... - uzp1 v4.8H, v19.8H, v3.8H // ..................................*............................................................................... - uzp2 v19.8H, v19.8H, v3.8H // .......................................*.......................................................................... - ldr q3, [x8], #32 // ..............................*................................................................................... - ldr q31, [x2], #32 // ......................................*........................................................................... - uzp1 v6.8H, v3.8H, v17.8H // ...................................................*.............................................................. - uzp2 v17.8H, v3.8H, v17.8H // .........................................................*........................................................ - smlal2 v22.4S, v9.8H, v6.8H // ..........................................................*....................................................... - smlal2 v30.4S, v9.8H, v17.8H // ...........................................................*...................................................... - smlal v13.4S, v9.4H, v17.4H // ............................................................*..................................................... 
- smlal v12.4S, v9.4H, v6.4H // .............................................................*.................................................... - smlal2 v22.4S, v23.8H, v16.8H // ..............................................................*................................................... - smlal2 v30.4S, v23.8H, v6.8H // ...............................................................*.................................................. - smlal v13.4S, v23.4H, v6.4H // ................................................................*................................................. - smlal v12.4S, v23.4H, v16.4H // .................................................................*................................................ - smlal2 v22.4S, v26.8H, v4.8H // ..................................................................*............................................... - smlal2 v30.4S, v26.8H, v19.8H // ...................................................................*.............................................. - smlal v13.4S, v26.4H, v19.4H // ....................................................................*............................................. - smlal v12.4S, v26.4H, v4.4H // .....................................................................*............................................ - smlal2 v22.4S, v10.8H, v8.8H // ......................................................................*........................................... - smlal2 v30.4S, v10.8H, v4.8H // .......................................................................*.......................................... - smlal v13.4S, v10.4H, v4.4H // ........................................................................*......................................... - smlal v12.4S, v10.4H, v8.4H // .........................................................................*........................................ 
- ldr q19, [x2, #-16] // .........................................*........................................................................ - uzp1 v23.8H, v13.8H, v30.8H // ...........................................................................*...................................... - uzp1 v17.8H, v12.8H, v22.8H // ....................................................................................*............................. - mul v23.8H, v23.8H, v2.8H // .............................................................................*.................................... - uzp2 v21.8H, v31.8H, v19.8H // ................................................................................*................................. - uzp1 v19.8H, v31.8H, v19.8H // ...................................................................................*.............................. - mul v17.8H, v17.8H, v2.8H // .....................................................................................*............................ - smlal v13.4S, v23.4H, v0.4H // .................................................................................*................................ - smlal2 v30.4S, v23.8H, v0.8H // ..................................................................................*............................... - ldr q23, [x5], #32 // .............................................*.................................................................... - smlal2 v22.4S, v17.8H, v0.8H // ...........................................................................................................*...... - uzp2 v15.8H, v13.8H, v30.8H // ......................................................................................*........................... - smlal v12.4S, v17.4H, v0.4H // ............................................................................................................*..... 
- ldr q17, [x5, #-16] // ................................................*................................................................. - ldr q13, [x1, #16] // ......................................................*........................................................... - uzp2 v27.8H, v23.8H, v17.8H // ....................................................*............................................................. - uzp1 v28.8H, v23.8H, v17.8H // ............................................................................*..................................... - uzp2 v7.8H, v12.8H, v22.8H // ...............................................................................................................*.. - ldr q23, [x1], #32 // ..........................................................................*....................................... - zip1 v5.8H, v7.8H, v15.8H // .................................................................................................................* - ldr q3, [x7, #16] // ........................................................................................*......................... - uzp1 v31.8H, v23.8H, v13.8H // ..............................................................................*................................... - uzp2 v16.8H, v23.8H, v13.8H // ...............................................................................*.................................. - smull2 v24.4S, v31.8H, v21.8H // .......................................................................................*.......................... - ldr q6, [x8, #16] // .........................................................................................*........................ - ldr q23, [x10], #32 // ..........................................................................................*....................... 
- smlal2 v24.4S, v16.8H, v19.8H // ..........................................................................................................*....... - ldr q17, [x10, #-16] // ...........................................................................................*...................... - ld1 {v22.8H}, [x12], #16 // ............................................................................................*..................... - uzp1 v30.8H, v23.8H, v17.8H // .............................................................................................*.................... - uzp2 v11.8H, v23.8H, v17.8H // ..............................................................................................*................... - ldr q23, [x4], #32 // ...............................................................................................*.................. - ldr q17, [x4, #-16] // ................................................................................................*................. - ldr q4, [x7], #32 // .................................................................................................*................ - uzp1 v20.8H, v23.8H, v17.8H // ..................................................................................................*............... - uzp2 v26.8H, v23.8H, v17.8H // ...................................................................................................*.............. - uzp1 v9.8H, v4.8H, v3.8H // .....................................................................................................*............ - smlal2 v24.4S, v20.8H, v27.8H // ..............................................................................................................*... - ld1 {v8.8H}, [x6], #16 // ....................................................................................................*............. 
- ldr q25, [x11, #16] // ......................................................................................................*........... - ldr q29, [x11], #32 // .......................................................................................................*.......... - ld1 {v12.8H}, [x9], #16 // ........................................................................................................*......... - uzp1 v10.8H, v29.8H, v25.8H // ................................................................................................................*. - ldr q14, [x8], #32 // .........................................................................................................*........ - ld1 {v23.8H}, [x3], #16 // .............................................................................................................*.... - - // ------------------------------------------------- new position --------------------------------------------------> - // 0 25 50 75 100 - // |------------------------|------------------------|------------------------|------------------------|------------- - // ldr q3, [x2], #32 // .*................................................................................................................ - // ldr q17, [x2, #-16] // *................................................................................................................. - // ldr q21, [x5], #32 // ..*............................................................................................................... - // ldr q19, [x5, #-16] // .....*............................................................................................................ - // uzp2 v27.8H, v21.8H, v19.8H // .......*.......................................................................................................... - // ldr q25, [x1, #16] // ......*........................................................................................................... 
- // ldr q22, [x1], #32 // .........*........................................................................................................ - // uzp1 v28.8H, v21.8H, v19.8H // ........*......................................................................................................... - // uzp1 v31.8H, v22.8H, v25.8H // ...........*...................................................................................................... - // uzp2 v16.8H, v22.8H, v25.8H // ............*..................................................................................................... - // uzp2 v21.8H, v3.8H, v17.8H // ...*.............................................................................................................. - // uzp1 v19.8H, v3.8H, v17.8H // ....*............................................................................................................. - // smull2 v24.4S, v31.8H, v21.8H // .............*.................................................................................................... - // ldr q3, [x7, #16] // ..........*....................................................................................................... - // ldr q6, [x8, #16] // .................................*................................................................................ - // ldr q8, [x10], #32 // ....................................*............................................................................. - // ldr q26, [x10, #-16] // .....................................*............................................................................ - // ld1 {v22.8H}, [x12], #16 // ......................................*........................................................................... - // uzp1 v30.8H, v8.8H, v26.8H // .......................................*.......................................................................... 
- // uzp2 v11.8H, v8.8H, v26.8H // ........................................*......................................................................... - // ldr q8, [x4], #32 // ...................*.............................................................................................. - // ldr q26, [x4, #-16] // ....................*............................................................................................. - // ldr q4, [x7], #32 // ................................*................................................................................. - // uzp1 v20.8H, v8.8H, v26.8H // ......................*........................................................................................... - // uzp2 v26.8H, v8.8H, v26.8H // .......................*.......................................................................................... - // ld1 {v8.8H}, [x6], #16 // .........................................*........................................................................ - // uzp1 v9.8H, v4.8H, v3.8H // ..................................*............................................................................... - // ldr q25, [x11, #16] // ..........................................*....................................................................... - // ldr q29, [x11], #32 // .............................................*.................................................................... - // ld1 {v12.8H}, [x9], #16 // ..............................................*................................................................... - // ldr q14, [x8], #32 // .................................................*................................................................ - // smlal2 v24.4S, v16.8H, v19.8H // .................*................................................................................................ 
- // ld1 {v23.8H}, [x3], #16 // .....................*............................................................................................ - // smlal2 v24.4S, v20.8H, v27.8H // ........................*......................................................................................... - // uzp1 v10.8H, v29.8H, v25.8H // ...............................................*.................................................................. - // smlal2 v24.4S, v26.8H, v28.8H // ............................*..................................................................................... - // uzp2 v4.8H, v4.8H, v3.8H // ...................................*.............................................................................. - // smull2 v13.4S, v31.8H, v19.8H // ...............*.................................................................................................. - // ldr q3, [x2], #32 // ..................................................*............................................................... - // uzp2 v1.8H, v29.8H, v25.8H // ................................................*................................................................. - // smlal2 v13.4S, v16.8H, v23.8H // ..........................*....................................................................................... - // ldr q17, [x2, #-16] // .....................................................................*............................................ - // smull v18.4S, v31.4H, v19.4H // ................*................................................................................................. - // smlal2 v13.4S, v20.8H, v28.8H // ..............................*................................................................................... - // smull v29.4S, v31.4H, v21.4H // ..............*................................................................................................... 
- // ldr q21, [x5], #32 // ..............................................................................*................................... - // smlal2 v13.4S, v26.8H, v8.8H // ...........................................*...................................................................... - // smlal v29.4S, v16.4H, v19.4H // ..................*............................................................................................... - // ldr q19, [x5, #-16] // ..................................................................................*............................... - // smlal v18.4S, v16.4H, v23.4H // ...........................*...................................................................................... - // smlal v29.4S, v20.4H, v27.4H // .........................*........................................................................................ - // uzp1 v31.8H, v14.8H, v6.8H // ...................................................*.............................................................. - // uzp2 v27.8H, v21.8H, v19.8H // ....................................................................................*............................. - // smlal v18.4S, v20.4H, v28.4H // ...............................*.................................................................................. - // ldr q25, [x1, #16] // ...................................................................................*.............................. - // smlal v29.4S, v26.4H, v28.4H // .............................*.................................................................................... - // smlal v18.4S, v26.4H, v8.4H // ............................................*..................................................................... - // uzp2 v26.8H, v14.8H, v6.8H // ....................................................*............................................................. 
- // smlal2 v13.4S, v9.8H, v31.8H // .....................................................*............................................................ - // smlal2 v24.4S, v9.8H, v26.8H // ......................................................*........................................................... - // smlal v29.4S, v9.4H, v26.4H // .......................................................*.......................................................... - // smlal v18.4S, v9.4H, v31.4H // ........................................................*......................................................... - // smlal2 v13.4S, v4.8H, v12.8H // .........................................................*........................................................ - // smlal2 v24.4S, v4.8H, v31.8H // ..........................................................*....................................................... - // smlal v29.4S, v4.4H, v31.4H // ...........................................................*...................................................... - // smlal v18.4S, v4.4H, v12.4H // ............................................................*..................................................... - // smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................................................... - // smlal2 v24.4S, v30.8H, v1.8H // ..............................................................*................................................... - // smlal v29.4S, v30.4H, v1.4H // ...............................................................*.................................................. - // smlal v18.4S, v30.4H, v10.4H // ................................................................*................................................. - // smlal2 v13.4S, v11.8H, v22.8H // .................................................................*................................................ 
- // smlal2 v24.4S, v11.8H, v10.8H // ..................................................................*............................................... - // smlal v29.4S, v11.4H, v10.4H // ...................................................................*.............................................. - // smlal v18.4S, v11.4H, v22.4H // ....................................................................*............................................. - // ldr q22, [x1], #32 // .......................................................................................*.......................... - // uzp1 v31.8H, v29.8H, v24.8H // ......................................................................*........................................... - // uzp1 v28.8H, v21.8H, v19.8H // .....................................................................................*............................ - // mul v19.8H, v31.8H, v2.8H // ........................................................................*......................................... - // uzp1 v31.8H, v22.8H, v25.8H // ..........................................................................................*....................... - // uzp2 v16.8H, v22.8H, v25.8H // ...........................................................................................*...................... - // uzp2 v21.8H, v3.8H, v17.8H // .........................................................................*........................................ - // smlal v29.4S, v19.4H, v0.4H // ............................................................................*..................................... - // smlal2 v24.4S, v19.8H, v0.8H // .............................................................................*.................................... - // uzp1 v19.8H, v3.8H, v17.8H // ..........................................................................*....................................... 
- // uzp1 v26.8H, v18.8H, v13.8H // .......................................................................*.......................................... - // mul v23.8H, v26.8H, v2.8H // ...........................................................................*...................................... - // uzp2 v15.8H, v29.8H, v24.8H // ................................................................................*................................. - // smull2 v24.4S, v31.8H, v21.8H // ............................................................................................*..................... - // ldr q3, [x7, #16] // .........................................................................................*........................ - // ldr q6, [x8, #16] // .............................................................................................*.................... - // ldr q8, [x10], #32 // ..............................................................................................*................... - // ldr q26, [x10, #-16] // ................................................................................................*................. - // ld1 {v22.8H}, [x12], #16 // .................................................................................................*................ - // uzp1 v30.8H, v8.8H, v26.8H // ..................................................................................................*............... - // uzp2 v11.8H, v8.8H, v26.8H // ...................................................................................................*.............. - // ldr q8, [x4], #32 // ....................................................................................................*............. - // ldr q26, [x4, #-16] // .....................................................................................................*............ 
- // ldr q4, [x7], #32 // ......................................................................................................*........... - // uzp1 v20.8H, v8.8H, v26.8H // .......................................................................................................*.......... - // uzp2 v26.8H, v8.8H, v26.8H // ........................................................................................................*......... - // ld1 {v8.8H}, [x6], #16 // ...........................................................................................................*...... - // uzp1 v9.8H, v4.8H, v3.8H // .........................................................................................................*........ - // ldr q25, [x11, #16] // ............................................................................................................*..... - // ldr q29, [x11], #32 // .............................................................................................................*.... - // ld1 {v12.8H}, [x9], #16 // ..............................................................................................................*... - // ldr q14, [x8], #32 // ................................................................................................................*. - // smlal2 v24.4S, v16.8H, v19.8H // ...............................................................................................*.................. - // smlal2 v13.4S, v23.8H, v0.8H // ...............................................................................*.................................. - // smlal v18.4S, v23.4H, v0.4H // .................................................................................*................................ 
- // ld1 {v23.8H}, [x3], #16 // .................................................................................................................* - // smlal2 v24.4S, v20.8H, v27.8H // ..........................................................................................................*....... - // uzp2 v7.8H, v18.8H, v13.8H // ......................................................................................*........................... - // uzp1 v10.8H, v29.8H, v25.8H // ...............................................................................................................*.. - // zip1 v5.8H, v7.8H, v15.8H // ........................................................................................*......................... - - sub count, count, #2 -1: - // Instructions: 82 - // Expected cycles: 102 - // Expected IPC: 0.80 - // - // Cycle bound: 102.0 - // IPC bound: 0.80 - // - // Wall time: 15.93s - // User time: 15.93s - // - // ------------------------------- original position -------------------------------> - // 0 25 50 75 - // |------------------------|------------------------|------------------------|------ - smlal2 v24.4S, v26.8H, v28.8H // .................................*................................................ - uzp2 v4.8H, v4.8H, v3.8H // .....................................*............................................ - smull2 v13.4S, v31.8H, v19.8H // ..........*....................................................................... - ldr q3, [x2], #32 // ....e............................................................................. - uzp2 v1.8H, v29.8H, v25.8H // ..........................................................*....................... - smlal2 v13.4S, v16.8H, v23.8H // ............*..................................................................... - ldr q17, [x2, #-16] // .....e............................................................................ 
- smull v18.4S, v31.4H, v19.4H // .........*........................................................................ - smlal2 v13.4S, v20.8H, v28.8H // ...........................*...................................................... - smull v29.4S, v31.4H, v21.4H // .............*.................................................................... - ldr q21, [x5], #32 // .....................e............................................................ - smlal2 v13.4S, v26.8H, v8.8H // .............................*.................................................... - smlal v29.4S, v16.4H, v19.4H // ...............*.................................................................. - ldr q19, [x5, #-16] // ......................e........................................................... - smlal v18.4S, v16.4H, v23.4H // ...........*...................................................................... - smlal v29.4S, v20.4H, v27.4H // ..............................*................................................... - uzp1 v31.8H, v14.8H, v6.8H // ........................................*......................................... - uzp2 v27.8H, v21.8H, v19.8H // ........................e......................................................... - smlal v18.4S, v20.4H, v28.4H // ..........................*....................................................... - ldr q25, [x1, #16] // .e................................................................................ - smlal v29.4S, v26.4H, v28.4H // ................................*................................................. - smlal v18.4S, v26.4H, v8.4H // ............................*..................................................... - uzp2 v26.8H, v14.8H, v6.8H // .........................................*........................................ - smlal2 v13.4S, v9.8H, v31.8H // ............................................*..................................... 
- smlal2 v24.4S, v9.8H, v26.8H // ................................................*................................. - smlal v29.4S, v9.4H, v26.4H // ...............................................*.................................. - smlal v18.4S, v9.4H, v31.4H // ...........................................*...................................... - smlal2 v13.4S, v4.8H, v12.8H // ..............................................*................................... - smlal2 v24.4S, v4.8H, v31.8H // ..................................................*............................... - smlal v29.4S, v4.4H, v31.4H // .................................................*................................ - smlal v18.4S, v4.4H, v12.4H // .............................................*.................................... - smlal2 v13.4S, v30.8H, v10.8H // .............................................................*.................... - smlal2 v24.4S, v30.8H, v1.8H // .................................................................*................ - smlal v29.4S, v30.4H, v1.4H // ................................................................*................. - smlal v18.4S, v30.4H, v10.4H // ............................................................*..................... - smlal2 v13.4S, v11.8H, v22.8H // ...............................................................*.................. - smlal2 v24.4S, v11.8H, v10.8H // ...................................................................*.............. - smlal v29.4S, v11.4H, v10.4H // ..................................................................*............... - smlal v18.4S, v11.4H, v22.4H // ..............................................................*................... - ldr q22, [x1], #32 // e................................................................................. - uzp1 v31.8H, v29.8H, v24.8H // .........................................................................*........ 
- uzp1 v28.8H, v21.8H, v19.8H // .......................e.......................................................... - mul v19.8H, v31.8H, v2.8H // ..........................................................................*....... - uzp1 v31.8H, v22.8H, v25.8H // ..e............................................................................... - uzp2 v16.8H, v22.8H, v25.8H // ...e.............................................................................. - uzp2 v21.8H, v3.8H, v17.8H // .......e.......................................................................... - smlal v29.4S, v19.4H, v0.4H // ...........................................................................*...... - smlal2 v24.4S, v19.8H, v0.8H // ............................................................................*..... - uzp1 v19.8H, v3.8H, v17.8H // ......e........................................................................... - uzp1 v26.8H, v18.8H, v13.8H // ....................................................................*............. - zip2 v14.8H, v7.8H, v15.8H // ...............................................................................l.. - mul v23.8H, v26.8H, v2.8H // .....................................................................*............ - uzp2 v15.8H, v29.8H, v24.8H // .............................................................................*.... - smull2 v24.4S, v31.8H, v21.8H // ..............e................................................................... - str q14, [x0, #16] // .................................................................................l - ldr q3, [x7, #16] // ...................................e.............................................. - ldr q6, [x8, #16] // .......................................e.......................................... - ldr q8, [x10], #32 // ...................................................e.............................. 
- ldr q26, [x10, #-16] // ....................................................e............................. - ld1 {v22.8H}, [x12], #16 // ...........................................................e...................... - uzp1 v30.8H, v8.8H, v26.8H // .....................................................e............................ - uzp2 v11.8H, v8.8H, v26.8H // ......................................................e........................... - ldr q8, [x4], #32 // .................e................................................................ - ldr q26, [x4, #-16] // ..................e............................................................... - ldr q4, [x7], #32 // ..................................e............................................... - uzp1 v20.8H, v8.8H, v26.8H // ...................e.............................................................. - uzp2 v26.8H, v8.8H, v26.8H // ....................e............................................................. - ld1 {v8.8H}, [x6], #16 // .........................e........................................................ - uzp1 v9.8H, v4.8H, v3.8H // ....................................e............................................. - ldr q25, [x11, #16] // ........................................................e......................... - ldr q29, [x11], #32 // .......................................................e.......................... - ld1 {v12.8H}, [x9], #16 // ..........................................e....................................... - ldr q14, [x8], #32 // ......................................e........................................... - smlal2 v24.4S, v16.8H, v19.8H // ................e................................................................. - smlal2 v13.4S, v23.8H, v0.8H // .......................................................................*.......... 
- smlal v18.4S, v23.4H, v0.4H // ......................................................................*........... - ld1 {v23.8H}, [x3], #16 // ........e......................................................................... - smlal2 v24.4S, v20.8H, v27.8H // ...............................e.................................................. - uzp2 v7.8H, v18.8H, v13.8H // ........................................................................*......... - uzp1 v10.8H, v29.8H, v25.8H // .........................................................e........................ - str q5, [x0], #32 // ................................................................................l. - zip1 v5.8H, v7.8H, v15.8H // ..............................................................................*... - - // ----------------------------------------------------------------------------------------------------------------- new position ------------------------------------------------------------------------------------------------------------------> - // 0 25 50 75 100 125 150 175 200 225 - // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|---------------- - // ldr q12, [x1], #32 // ....................................e..........................................'......................................~..........................................'......................................~......................................... - // ldr q13, [x1, #-16] // ................e..............................................................'..................~..............................................................'..................~............................................................. 
- // uzp1 v3.8h, v12.8h, v13.8h // ........................................e......................................'..........................................~......................................'..........................................~..................................... - // uzp2 v4.8h, v12.8h, v13.8h // .........................................e.....................................'...........................................~.....................................'...........................................~.................................... - // ldr q12, [x2], #32 // e..............................................................................'..~..............................................................................'..~............................................................................. - // ldr q13, [x2, #-16] // ...e...........................................................................'.....~...........................................................................'.....~.......................................................................... - // uzp1 v5.8h, v12.8h, v13.8h // .............................................e.................................'...............................................~.................................'...............................................~................................ - // uzp2 v6.8h, v12.8h, v13.8h // ..........................................e....................................'............................................~....................................'............................................~................................... - // ld1 {v7.8h}, [x3], #16 // .........................................................................e.....'...........................................................................~.....'...........................................................................~.... 
- // smull v8.4s, v3.4h, v5.4h // ....~..........................................................................'......*..........................................................................'......~......................................................................... - // smull2 v10.4s, v3.8h, v5.8h // ...............................................................................'.*...............................................................................'.~.............................................................................. - // smlal v8.4s, v4.4h, v7.4h // ...........~...................................................................'.............*...................................................................'.............~.................................................................. - // smlal2 v10.4s, v4.8h, v7.8h // ..~............................................................................'....*............................................................................'....~........................................................................... - // smull v9.4s, v3.4h, v6.4h // ......~........................................................................'........*........................................................................'........~....................................................................... - // smull2 v11.4s, v3.8h, v6.8h // ..................................................e............................'....................................................~............................'....................................................~........................... - // smlal v9.4s, v4.4h, v5.4h // .........~.....................................................................'...........*.....................................................................'...........~.................................................................... 
- // smlal2 v11.4s, v4.8h, v5.8h // ......................................................................e........'........................................................................~........'........................................................................~....... - // ldr q12, [x4], #32 // ...........................................................e...................'.............................................................~...................'.............................................................~.................. - // ldr q13, [x4, #-16] // ............................................................e..................'..............................................................~..................'..............................................................~................. - // uzp1 v3.8h, v12.8h, v13.8h // ..............................................................e................'................................................................~................'................................................................~............... - // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................e...............'.................................................................~...............'.................................................................~.............. - // ldr q12, [x5], #32 // .......e.......................................................................'.........~.......................................................................'.........~...................................................................... - // ldr q13, [x5, #-16] // ..........e....................................................................'............~....................................................................'............~................................................................... 
- // uzp1 v5.8h, v12.8h, v13.8h // ......................................e........................................'........................................~........................................'........................................~....................................... - // uzp2 v6.8h, v12.8h, v13.8h // ..............e................................................................'................~................................................................'................~............................................................... - // ld1 {v7.8h}, [x6], #16 // ................................................................e..............'..................................................................~..............'..................................................................~............. - // smlal v8.4s, v3.4h, v5.4h // ...............~...............................................................'.................*...............................................................'.................~.............................................................. - // smlal2 v10.4s, v3.8h, v5.8h // .....~.........................................................................'.......*.........................................................................'.......~........................................................................ - // smlal v8.4s, v4.4h, v7.4h // ..................~............................................................'....................*............................................................'....................~........................................................... - // smlal2 v10.4s, v4.8h, v7.8h // ........~......................................................................'..........*......................................................................'..........~..................................................................... 
- // smlal v9.4s, v3.4h, v6.4h // ............~..................................................................'..............*..................................................................'..............~................................................................. - // smlal2 v11.4s, v3.8h, v6.8h // ..........................................................................e....'............................................................................~....'............................................................................~... - // smlal v9.4s, v4.4h, v5.4h // .................~.............................................................'...................*.............................................................'...................~............................................................ - // smlal2 v11.4s, v4.8h, v5.8h // ...............................................................................*.................................................................................~................................................................................ - // ldr q12, [x7], #32 // .............................................................e.................'...............................................................~.................'...............................................................~................ - // ldr q13, [x7, #-16] // ....................................................e..........................'......................................................~..........................'......................................................~......................... - // uzp1 v3.8h, v12.8h, v13.8h // .................................................................e.............'...................................................................~.............'...................................................................~............ 
- // uzp2 v4.8h, v12.8h, v13.8h // ...............................................................................'*................................................................................'~............................................................................... - // ldr q12, [x8], #32 // .....................................................................e.........'.......................................................................~.........'.......................................................................~........ - // ldr q13, [x8, #-16] // .....................................................e.........................'.......................................................~.........................'.......................................................~........................ - // uzp1 v5.8h, v12.8h, v13.8h // .............~.................................................................'...............*.................................................................'...............~................................................................ - // uzp2 v6.8h, v12.8h, v13.8h // ...................~...........................................................'.....................*...........................................................'.....................~.......................................................... - // ld1 {v7.8h}, [x9], #16 // ....................................................................e..........'......................................................................~..........'......................................................................~......... - // smlal v8.4s, v3.4h, v5.4h // .......................~.......................................................'.........................*.......................................................'.........................~...................................................... 
- // smlal2 v10.4s, v3.8h, v5.8h // ....................~..........................................................'......................*..........................................................'......................~......................................................... - // smlal v8.4s, v4.4h, v7.4h // ...........................~...................................................'.............................*...................................................'.............................~.................................................. - // smlal2 v10.4s, v4.8h, v7.8h // ........................~......................................................'..........................*......................................................'..........................~..................................................... - // smlal v9.4s, v3.4h, v6.4h // ......................~........................................................'........................*........................................................'........................~....................................................... - // smlal2 v11.4s, v3.8h, v6.8h // .....................~.........................................................'.......................*.........................................................'.......................~........................................................ - // smlal v9.4s, v4.4h, v5.4h // ..........................~....................................................'............................*....................................................'............................~................................................... - // smlal2 v11.4s, v4.8h, v5.8h // .........................~.....................................................'...........................*.....................................................'...........................~.................................................... 
- // ldr q12, [x10], #32 // ......................................................e........................'........................................................~........................'........................................................~....................... - // ldr q13, [x10, #-16] // .......................................................e.......................'.........................................................~.......................'.........................................................~...................... - // uzp1 v3.8h, v12.8h, v13.8h // .........................................................e.....................'...........................................................~.....................'...........................................................~.................... - // uzp2 v4.8h, v12.8h, v13.8h // ..........................................................e....................'............................................................~....................'............................................................~................... - // ldr q12, [x11], #32 // ...................................................................e...........'.....................................................................~...........'.....................................................................~.......... - // ldr q13, [x11, #-16] // ..................................................................e............'....................................................................~............'....................................................................~........... - // uzp1 v5.8h, v12.8h, v13.8h // ............................................................................e..'..............................................................................~..'..............................................................................~. 
- // uzp2 v6.8h, v12.8h, v13.8h // .~.............................................................................'...*.............................................................................'...~............................................................................ - // ld1 {v7.8h}, [x12], #16 // ........................................................e......................'..........................................................~......................'..........................................................~..................... - // smlal v8.4s, v3.4h, v5.4h // ...............................~...............................................'.................................*...............................................'.................................~.............................................. - // smlal2 v10.4s, v3.8h, v5.8h // ............................~..................................................'..............................*..................................................'..............................~................................................. - // smlal v8.4s, v4.4h, v7.4h // ...................................~...........................................'.....................................*...........................................'.....................................~.......................................... - // smlal2 v10.4s, v4.8h, v7.8h // ................................~..............................................'..................................*..............................................'..................................~............................................. - // smlal v9.4s, v3.4h, v6.4h // ..............................~................................................'................................*................................................'................................~............................................... 
- // smlal2 v11.4s, v3.8h, v6.8h // .............................~.................................................'...............................*.................................................'...............................~................................................ - // smlal v9.4s, v4.4h, v5.4h // ..................................~............................................'....................................*............................................'....................................~........................................... - // smlal2 v11.4s, v4.8h, v5.8h // .................................~.............................................'...................................*.............................................'...................................~............................................ - // uzp1 v28.8h, v8.8h, v10.8h // ..............................................~................................'................................................*................................'................................................~............................... - // mul v28.8h, v28.8h, v2.8h // ................................................~..............................'..................................................*..............................'..................................................~............................. - // smlal v8.4s, v28.4h, v0.4h // ........................................................................~......'..........................................................................*......'..........................................................................~..... - // smlal2 v10.4s, v28.8h, v0.8h // .......................................................................~.......'.........................................................................*.......'.........................................................................~...... 
- // uzp2 v26.8h, v8.8h, v10.8h // ...........................................................................~...'.............................................................................*...'.............................................................................~.. - // uzp1 v28.8h, v9.8h, v11.8h // .....................................~.........................................'.......................................*.........................................'.......................................~........................................ - // mul v28.8h, v28.8h, v2.8h // .......................................~.......................................'.........................................*.......................................'.........................................~...................................... - // smlal v9.4s, v28.4h, v0.4h // ...........................................~...................................'.............................................*...................................'.............................................~.................................. - // smlal2 v11.4s, v28.8h, v0.8h // ............................................~..................................'..............................................*..................................'..............................................~................................. - // uzp2 v27.8h, v9.8h, v11.8h // .................................................~.............................'...................................................*.............................'...................................................~............................ - // zip1 v12.8h, v26.8h, v27.8h // ..............................................................................~'................................................................................*'................................................................................ 
- // zip2 v13.8h, v26.8h, v27.8h // ...............................................~...............................'.................................................~...............................'.................................................l.............................. - // str q12, [x0], #32 // .............................................................................~.'...............................................................................~.'...............................................................................l - // str q13, [x0, #-16] // ...................................................~...........................'.....................................................~...........................'.....................................................l.......................... - - sub count, count, #1 - cbnz count, 1b - // Instructions: 50 - // Expected cycles: 56 - // Expected IPC: 0.89 - // - // Cycle bound: 56.0 - // IPC bound: 0.89 - // - // Wall time: 4.16s - // User time: 4.16s - // - // --------------- original position ---------------> - // 0 25 - // |------------------------| - smull2 v17.4S, v31.8H, v19.8H // ..*............................................... - uzp2 v1.8H, v14.8H, v6.8H // ................*................................. - smull v18.4S, v31.4H, v21.4H // .......*.......................................... - smlal2 v24.4S, v26.8H, v28.8H // *................................................. - smlal2 v17.4S, v16.8H, v23.8H // ....*............................................. - smull v21.4S, v31.4H, v19.4H // .....*............................................ - smlal v18.4S, v16.4H, v19.4H // .........*........................................ - uzp2 v31.8H, v4.8H, v3.8H // .*................................................ - uzp1 v3.8H, v14.8H, v6.8H // ............*..................................... - smlal v21.4S, v16.4H, v23.4H // ..........*....................................... 
- smlal v18.4S, v20.4H, v27.4H // ...........*...................................... - uzp2 v14.8H, v29.8H, v25.8H // ...*.............................................. - smlal2 v17.4S, v20.8H, v28.8H // ......*........................................... - smlal v21.4S, v20.4H, v28.4H // .............*.................................... - smlal v18.4S, v26.4H, v28.4H // ..............*................................... - smlal2 v24.4S, v9.8H, v1.8H // ..................*............................... - smlal2 v17.4S, v26.8H, v8.8H // ........*......................................... - smlal v21.4S, v26.4H, v8.4H // ...............*.................................. - smlal v18.4S, v9.4H, v1.4H // ...................*.............................. - smlal2 v24.4S, v31.8H, v3.8H // ......................*........................... - smlal2 v17.4S, v9.8H, v3.8H // .................*................................ - smlal v21.4S, v9.4H, v3.4H // ....................*............................. - smlal v18.4S, v31.4H, v3.4H // .......................*.......................... - smlal2 v24.4S, v30.8H, v14.8H // ..........................*....................... - smlal2 v17.4S, v31.8H, v12.8H // .....................*............................ - smlal v21.4S, v31.4H, v12.4H // ........................*......................... - smlal v18.4S, v30.4H, v14.4H // ...........................*...................... - smlal2 v24.4S, v11.8H, v10.8H // ..............................*................... - smlal2 v17.4S, v30.8H, v10.8H // .........................*........................ - smlal v21.4S, v30.4H, v10.4H // ............................*..................... - smlal v18.4S, v11.4H, v10.4H // ...............................*.................. - zip2 v19.8H, v7.8H, v15.8H // ......................................*........... - smlal2 v17.4S, v11.8H, v22.8H // .............................*.................... 
- smlal v21.4S, v11.4H, v22.4H // ................................*................. - uzp1 v23.8H, v18.8H, v24.8H // .................................*................ - str q19, [x0, #16] // .........................................*........ - mul v19.8H, v23.8H, v2.8H // ..................................*............... - uzp1 v23.8H, v21.8H, v17.8H // .....................................*............ - str q5, [x0], #32 // .............................................*.... - mul v26.8H, v23.8H, v2.8H // .......................................*.......... - smlal v18.4S, v19.4H, v0.4H // ...................................*.............. - smlal2 v24.4S, v19.8H, v0.8H // ....................................*............. - smlal v21.4S, v26.4H, v0.4H // ...........................................*...... - smlal2 v17.4S, v26.8H, v0.8H // ..........................................*....... - uzp2 v13.8H, v18.8H, v24.8H // ........................................*......... - uzp2 v19.8H, v21.8H, v17.8H // ............................................*..... - zip1 v23.8H, v19.8H, v13.8H // ..............................................*... - zip2 v19.8H, v19.8H, v13.8H // ...............................................*.. - str q23, [x0], #32 // .................................................* - str q19, [x0, #-16] // ................................................*. - - // ----------------- new position ------------------> - // 0 25 - // |------------------------|------------------------ - // smlal2 v24.4S, v26.8H, v28.8H // ...*.............................................. - // uzp2 v4.8H, v4.8H, v3.8H // .......*.......................................... - // smull2 v13.4S, v31.8H, v19.8H // *................................................. - // uzp2 v1.8H, v29.8H, v25.8H // ...........*...................................... - // smlal2 v13.4S, v16.8H, v23.8H // ....*............................................. 
- // smull v18.4S, v31.4H, v19.4H // .....*............................................ - // smlal2 v13.4S, v20.8H, v28.8H // ............*..................................... - // smull v29.4S, v31.4H, v21.4H // ..*............................................... - // smlal2 v13.4S, v26.8H, v8.8H // ................*................................. - // smlal v29.4S, v16.4H, v19.4H // ......*........................................... - // smlal v18.4S, v16.4H, v23.4H // .........*........................................ - // smlal v29.4S, v20.4H, v27.4H // ..........*....................................... - // uzp1 v31.8H, v14.8H, v6.8H // ........*......................................... - // smlal v18.4S, v20.4H, v28.4H // .............*.................................... - // smlal v29.4S, v26.4H, v28.4H // ..............*................................... - // smlal v18.4S, v26.4H, v8.4H // .................*................................ - // uzp2 v26.8H, v14.8H, v6.8H // .*................................................ - // smlal2 v13.4S, v9.8H, v31.8H // ....................*............................. - // smlal2 v24.4S, v9.8H, v26.8H // ...............*.................................. - // smlal v29.4S, v9.4H, v26.4H // ..................*............................... - // smlal v18.4S, v9.4H, v31.4H // .....................*............................ - // smlal2 v13.4S, v4.8H, v12.8H // ........................*......................... - // smlal2 v24.4S, v4.8H, v31.8H // ...................*.............................. - // smlal v29.4S, v4.4H, v31.4H // ......................*........................... - // smlal v18.4S, v4.4H, v12.4H // .........................*........................ - // smlal2 v13.4S, v30.8H, v10.8H // ............................*..................... - // smlal2 v24.4S, v30.8H, v1.8H // .......................*.......................... 
- // smlal v29.4S, v30.4H, v1.4H // ..........................*....................... - // smlal v18.4S, v30.4H, v10.4H // .............................*.................... - // smlal2 v13.4S, v11.8H, v22.8H // ................................*................. - // smlal2 v24.4S, v11.8H, v10.8H // ...........................*...................... - // smlal v29.4S, v11.4H, v10.4H // ..............................*................... - // smlal v18.4S, v11.4H, v22.4H // .................................*................ - // uzp1 v31.8H, v29.8H, v24.8H // ..................................*............... - // mul v19.8H, v31.8H, v2.8H // ....................................*............. - // smlal v29.4S, v19.4H, v0.4H // ........................................*......... - // smlal2 v24.4S, v19.8H, v0.8H // .........................................*........ - // uzp1 v26.8H, v18.8H, v13.8H // .....................................*............ - // zip2 v14.8H, v7.8H, v15.8H // ...............................*.................. - // mul v23.8H, v26.8H, v2.8H // .......................................*.......... - // uzp2 v15.8H, v29.8H, v24.8H // ............................................*..... - // str q14, [x0, #16] // ...................................*.............. - // smlal2 v13.4S, v23.8H, v0.8H // ...........................................*...... - // smlal v18.4S, v23.4H, v0.4H // ..........................................*....... - // uzp2 v7.8H, v18.8H, v13.8H // .............................................*.... - // str q5, [x0], #32 // ......................................*........... - // zip1 v5.8H, v7.8H, v15.8H // ..............................................*... - // zip2 v14.8H, v7.8H, v15.8H // ...............................................*.. - // str q14, [x0, #16] // .................................................* - // str q5, [x0], #32 // ................................................*. 
- - - pop_stack - ret -#endif /* MLKEM_K == 4 */ - -/****************** REGISTER DEALLOCATIONS *******************/ - .unreq out - .unreq a0_ptr - .unreq b0_ptr - .unreq b0_cache_ptr - .unreq a1_ptr - .unreq b1_ptr - .unreq b1_cache_ptr - .unreq a2_ptr - .unreq b2_ptr - .unreq b2_cache_ptr - .unreq a3_ptr - .unreq b3_ptr - .unreq b3_cache_ptr - .unreq count - .unreq modulus - .unreq modulus_twisted - .unreq wtmp - .unreq aa0 - .unreq aa1 - .unreq bb0 - .unreq bb1 - .unreq bb1t - .unreq res0l - .unreq res1l - .unreq res0h - .unreq res1h - .unreq tmp0 - .unreq tmp1 - .unreq q_tmp0 - .unreq q_tmp1 - .unreq out0 - .unreq out1 - .unreq t0 - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S index 8302d2a3e..f2451815a 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_asm_clean.S @@ -19,8 +19,8 @@ * Returns number of sampled 16-bit integers (at most MLKEM_N). **************************************************/ #include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +/* simpasm: header-end */ // We save the output on the stack first, and copy to the actual // output buffer only in the end. 
This is because the main loop can overwrite @@ -112,9 +112,9 @@ mlkem_q .req v30 bits .req v31 -.text -.global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean) -.balign 4 + .text + .global MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean) + .balign 4 MLKEM_ASM_NAMESPACE(rej_uniform_asm_clean): push_stack @@ -402,5 +402,5 @@ return: .unreq mlkem_q .unreq bits -#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */ +/* simpasm: footer-start */ +#endif /* defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c index becdf303b..592c15fb0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/aarch64/src/rej_uniform_table.c @@ -10,8 +10,7 @@ #include "../../../common.h" -#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_CLEAN) || \ - defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) +#if defined(MLKEM_NATIVE_ARITH_BACKEND_AARCH64_OPT) #include #include "arith_native_aarch64.h" diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h index f9fe4310a..df43dc5b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_aarch64/native/default.h @@ -11,21 +11,10 @@ #include "../sys.h" #ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. - */ #include "aarch64/opt.h" #endif /* SYS_AARCH64 */ #ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. 
- * https://github.com/pq-crystals/kyber - */ #include "x86_64/default.h" #endif /* SYS_X86_64 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h index f9fe4310a..df43dc5b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_ref/native/default.h @@ -11,21 +11,10 @@ #include "../sys.h" #ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. - */ #include "aarch64/opt.h" #endif /* SYS_AARCH64 */ #ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. - * https://github.com/pq-crystals/kyber - */ #include "x86_64/default.h" #endif /* SYS_X86_64 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h index f9fe4310a..df43dc5b0 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/default.h @@ -11,21 +11,10 @@ #include "../sys.h" #ifdef SYS_AARCH64 -/* - * For AArch64, we currently we have one clean and one opt profile. - * We default to the opt profile. - * - * In the future, this may branch further depending on the microarchitecture. - */ #include "aarch64/opt.h" #endif /* SYS_AARCH64 */ #ifdef SYS_X86_64_AVX2 -/* - * For now, there's only one x86_64 profile, based on - * the AVX2 code from the Kyber repository. 
- * https://github.com/pq-crystals/kyber - */ #include "x86_64/default.h" #endif /* SYS_X86_64 */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S index 5fdc3d0a0..3063d20ae 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/basemul.S @@ -8,6 +8,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ #include "consts.h" @@ -113,6 +114,7 @@ vmovdqa %ymm11,(64*\off+48)*2(%rdi) .text .global MLKEM_ASM_NAMESPACE(basemul_avx2) +.balign 4 MLKEM_ASM_NAMESPACE(basemul_avx2): mov %rsp,%r8 and $-32,%rsp @@ -133,4 +135,5 @@ schoolbook 3 mov %r8,%rsp ret +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S index 7b1f22624..e74199930 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/intt.S @@ -12,6 +12,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ #include "consts.h" #include "shuffle.inc" @@ -242,6 +243,7 @@ vmovdqa %ymm11,(64*\off+176)*2(%rdi) .text .global MLKEM_ASM_NAMESPACE(invntt_avx2) +.balign 4 MLKEM_ASM_NAMESPACE(invntt_avx2): vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 @@ -252,4 +254,5 @@ intt_level6 0 intt_level6 1 ret +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S index 5d928b4cc..70582fbc1 100644 --- 
a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntt.S @@ -8,6 +8,7 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ #include "consts.h" #include "shuffle.inc" @@ -205,6 +206,7 @@ vmovdqa %ymm11,(128*\off+112)*2(%rdi) .text .global MLKEM_ASM_NAMESPACE(ntt_avx2) +.balign 4 MLKEM_ASM_NAMESPACE(ntt_avx2): vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 @@ -216,4 +218,5 @@ levels1t6 1 ret +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttfrombytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttfrombytes.S new file mode 100644 index 000000000..71f2af000 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttfrombytes.S @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(nttfrombytes_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(nttfrombytes_avx2): +#consts +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret + +nttfrombytes128_avx: +#load +vmovdqu (%rsi),%ymm4 +vmovdqu 32(%rsi),%ymm5 +vmovdqu 64(%rsi),%ymm6 +vmovdqu 96(%rsi),%ymm7 +vmovdqu 128(%rsi),%ymm8 +vmovdqu 160(%rsi),%ymm9 + +shuffle8 4,7,3,7 +shuffle8 5,8,4,8 +shuffle8 6,9,5,9 + +shuffle4 3,8,6,8 +shuffle4 7,5,3,5 +shuffle4 4,9,7,9 + +shuffle2 6,5,4,5 +shuffle2 8,7,6,7 +shuffle2 3,9,8,9 + +shuffle1 4,7,10,7 
+shuffle1 5,8,4,8 +shuffle1 6,9,5,9 + +#bitunpack +vpsrlw $12,%ymm10,%ymm11 +vpsllw $4,%ymm7,%ymm12 +vpor %ymm11,%ymm12,%ymm11 +vpand %ymm0,%ymm10,%ymm10 +vpand %ymm0,%ymm11,%ymm11 + +vpsrlw $8,%ymm7,%ymm12 +vpsllw $8,%ymm4,%ymm13 +vpor %ymm12,%ymm13,%ymm12 +vpand %ymm0,%ymm12,%ymm12 + +vpsrlw $4,%ymm4,%ymm13 +vpand %ymm0,%ymm13,%ymm13 + +vpsrlw $12,%ymm8,%ymm14 +vpsllw $4,%ymm5,%ymm15 +vpor %ymm14,%ymm15,%ymm14 +vpand %ymm0,%ymm8,%ymm8 +vpand %ymm0,%ymm14,%ymm14 + +vpsrlw $8,%ymm5,%ymm15 +vpsllw $8,%ymm9,%ymm1 +vpor %ymm15,%ymm1,%ymm15 +vpand %ymm0,%ymm15,%ymm15 + +vpsrlw $4,%ymm9,%ymm1 +vpand %ymm0,%ymm1,%ymm1 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm11,32(%rdi) +vmovdqa %ymm12,64(%rdi) +vmovdqa %ymm13,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm14,160(%rdi) +vmovdqa %ymm15,192(%rdi) +vmovdqa %ymm1,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttpack.S new file mode 100644 index 000000000..4c10ef366 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttpack.S @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(nttpack_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(nttpack_avx2): +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 
8,9,6,9 +shuffle1 10,11,8,11 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +#store +vmovdqa %ymm7,(%rdi) +vmovdqa %ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntttobytes.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntttobytes.S new file mode 100644 index 000000000..4f0b01e83 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/ntttobytes.S @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(ntttobytes_avx2): +#consts +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret + +ntttobytes128_avx: +#load +vmovdqa (%rsi),%ymm5 +vmovdqa 32(%rsi),%ymm6 +vmovdqa 64(%rsi),%ymm7 +vmovdqa 96(%rsi),%ymm8 +vmovdqa 128(%rsi),%ymm9 +vmovdqa 160(%rsi),%ymm10 +vmovdqa 192(%rsi),%ymm11 +vmovdqa 224(%rsi),%ymm12 + +#bitpack +vpsllw $12,%ymm6,%ymm4 +vpor %ymm4,%ymm5,%ymm4 + +vpsrlw $4,%ymm6,%ymm5 +vpsllw $8,%ymm7,%ymm6 +vpor %ymm5,%ymm6,%ymm5 + +vpsrlw $8,%ymm7,%ymm6 +vpsllw $4,%ymm8,%ymm7 +vpor %ymm6,%ymm7,%ymm6 + +vpsllw $12,%ymm10,%ymm7 
+vpor %ymm7,%ymm9,%ymm7 + +vpsrlw $4,%ymm10,%ymm8 +vpsllw $8,%ymm11,%ymm9 +vpor %ymm8,%ymm9,%ymm8 + +vpsrlw $8,%ymm11,%ymm9 +vpsllw $4,%ymm12,%ymm10 +vpor %ymm9,%ymm10,%ymm9 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 + +shuffle2 3,4,8,4 +shuffle2 6,5,3,5 +shuffle2 7,9,6,9 + +shuffle4 8,3,7,3 +shuffle4 6,4,8,4 +shuffle4 5,9,6,9 + +shuffle8 7,8,5,8 +shuffle8 6,3,7,3 +shuffle8 4,9,6,9 + +#store +vmovdqu %ymm5,(%rdi) +vmovdqu %ymm7,32(%rdi) +vmovdqu %ymm6,64(%rdi) +vmovdqu %ymm8,96(%rdi) +vmovdqu %ymm3,128(%rdi) +vmovdqu %ymm9,160(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttunpack.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttunpack.S new file mode 100644 index 000000000..0cf45c671 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/nttunpack.S @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation from Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" +#include "shuffle.inc" + +.text +.global MLKEM_ASM_NAMESPACE(nttunpack_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(nttunpack_avx2): +call nttunpack128_avx2 +add $256,%rdi +call nttunpack128_avx2 +ret + +nttunpack128_avx2: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 
+ +shuffle1 9,5,10,5 +shuffle1 8,4,9,4 +shuffle1 7,3,8,3 +shuffle1 6,11,7,11 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm9,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm3,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/reduce.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/reduce.S new file mode 100644 index 000000000..78bad0559 --- /dev/null +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/reduce.S @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2024 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 + */ + +// Implementation based on Kyber reference repository +// https://github.com/pq-crystals/kyber/blob/main/avx2 + +// Changes: +// - Add call to csub in reduce128_avx to produce outputs +// in [0,1,...,q-1] rather than [0,1,...,q], matching the +// semantics of poly_reduce(). 
+ +#include "../../../common.h" + +#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) +/* simpasm: header-end */ + +#include "consts.h" +#include "fq.inc" + +.text +.global MLKEM_ASM_NAMESPACE(reduce_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(reduce_avx2): +#consts +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1 +call reduce128_avx2 +add $256,%rdi +call reduce128_avx2 +ret + +reduce128_avx2: +#load +vmovdqa (%rdi),%ymm2 +vmovdqa 32(%rdi),%ymm3 +vmovdqa 64(%rdi),%ymm4 +vmovdqa 96(%rdi),%ymm5 +vmovdqa 128(%rdi),%ymm6 +vmovdqa 160(%rdi),%ymm7 +vmovdqa 192(%rdi),%ymm8 +vmovdqa 224(%rdi),%ymm9 + +red16 2 +red16 3 +red16 4 +red16 5 +red16 6 +red16 7 +red16 8 +red16 9 + +csubq 2 +csubq 3 +csubq 4 +csubq 5 +csubq 6 +csubq 7 +csubq 8 +csubq 9 + +#store +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm4,64(%rdi) +vmovdqa %ymm5,96(%rdi) +vmovdqa %ymm6,128(%rdi) +vmovdqa %ymm7,160(%rdi) +vmovdqa %ymm8,192(%rdi) +vmovdqa %ymm9,224(%rdi) + +ret + +/* simpasm: footer-start */ +#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S deleted file mode 100644 index 9bcd04896..000000000 --- a/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/shuffle.S +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Copyright (c) 2024 The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 - */ - -// Implementation from Kyber reference repository -// https://github.com/pq-crystals/kyber/blob/main/avx2 - -#include "../../../common.h" - -#if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) - -#include "consts.h" -#include "fq.inc" -#include "shuffle.inc" - -.global MLKEM_ASM_NAMESPACE(nttpack_avx2) -MLKEM_ASM_NAMESPACE(nttpack_avx2): -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 
128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 -shuffle1 10,11,8,11 - -shuffle2 3,4,10,4 -shuffle2 6,8,3,8 -shuffle2 5,7,6,7 -shuffle2 9,11,5,11 - -shuffle4 10,3,9,3 -shuffle4 6,5,10,5 -shuffle4 4,8,6,8 -shuffle4 7,11,4,11 - -shuffle8 9,10,7,10 -shuffle8 6,4,9,4 -shuffle8 3,5,6,5 -shuffle8 8,11,3,11 - -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -nttunpack128_avx2: -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -.global MLKEM_ASM_NAMESPACE(nttunpack_avx2) -MLKEM_ASM_NAMESPACE(nttunpack_avx2): -call nttunpack128_avx2 -add $256,%rdi -call nttunpack128_avx2 -ret - -ntttobytes128_avx: -#load -vmovdqa (%rsi),%ymm5 -vmovdqa 32(%rsi),%ymm6 -vmovdqa 64(%rsi),%ymm7 -vmovdqa 96(%rsi),%ymm8 -vmovdqa 128(%rsi),%ymm9 -vmovdqa 160(%rsi),%ymm10 -vmovdqa 192(%rsi),%ymm11 -vmovdqa 224(%rsi),%ymm12 - -#bitpack -vpsllw $12,%ymm6,%ymm4 -vpor %ymm4,%ymm5,%ymm4 - -vpsrlw $4,%ymm6,%ymm5 -vpsllw $8,%ymm7,%ymm6 -vpor %ymm5,%ymm6,%ymm5 - -vpsrlw $8,%ymm7,%ymm6 -vpsllw $4,%ymm8,%ymm7 -vpor %ymm6,%ymm7,%ymm6 - -vpsllw $12,%ymm10,%ymm7 -vpor %ymm7,%ymm9,%ymm7 - 
-vpsrlw $4,%ymm10,%ymm8 -vpsllw $8,%ymm11,%ymm9 -vpor %ymm8,%ymm9,%ymm8 - -vpsrlw $8,%ymm11,%ymm9 -vpsllw $4,%ymm12,%ymm10 -vpor %ymm9,%ymm10,%ymm9 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 - -shuffle2 3,4,8,4 -shuffle2 6,5,3,5 -shuffle2 7,9,6,9 - -shuffle4 8,3,7,3 -shuffle4 6,4,8,4 -shuffle4 5,9,6,9 - -shuffle8 7,8,5,8 -shuffle8 6,3,7,3 -shuffle8 4,9,6,9 - -#store -vmovdqu %ymm5,(%rdi) -vmovdqu %ymm7,32(%rdi) -vmovdqu %ymm6,64(%rdi) -vmovdqu %ymm8,96(%rdi) -vmovdqu %ymm3,128(%rdi) -vmovdqu %ymm9,160(%rdi) - -ret - -.global MLKEM_ASM_NAMESPACE(ntttobytes_avx2) -MLKEM_ASM_NAMESPACE(ntttobytes_avx2): -#consts -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rdx),%ymm0 -call ntttobytes128_avx -add $256,%rsi -add $192,%rdi -call ntttobytes128_avx -ret - -nttfrombytes128_avx: -#load -vmovdqu (%rsi),%ymm4 -vmovdqu 32(%rsi),%ymm5 -vmovdqu 64(%rsi),%ymm6 -vmovdqu 96(%rsi),%ymm7 -vmovdqu 128(%rsi),%ymm8 -vmovdqu 160(%rsi),%ymm9 - -shuffle8 4,7,3,7 -shuffle8 5,8,4,8 -shuffle8 6,9,5,9 - -shuffle4 3,8,6,8 -shuffle4 7,5,3,5 -shuffle4 4,9,7,9 - -shuffle2 6,5,4,5 -shuffle2 8,7,6,7 -shuffle2 3,9,8,9 - -shuffle1 4,7,10,7 -shuffle1 5,8,4,8 -shuffle1 6,9,5,9 - -#bitunpack -vpsrlw $12,%ymm10,%ymm11 -vpsllw $4,%ymm7,%ymm12 -vpor %ymm11,%ymm12,%ymm11 -vpand %ymm0,%ymm10,%ymm10 -vpand %ymm0,%ymm11,%ymm11 - -vpsrlw $8,%ymm7,%ymm12 -vpsllw $8,%ymm4,%ymm13 -vpor %ymm12,%ymm13,%ymm12 -vpand %ymm0,%ymm12,%ymm12 - -vpsrlw $4,%ymm4,%ymm13 -vpand %ymm0,%ymm13,%ymm13 - -vpsrlw $12,%ymm8,%ymm14 -vpsllw $4,%ymm5,%ymm15 -vpor %ymm14,%ymm15,%ymm14 -vpand %ymm0,%ymm8,%ymm8 -vpand %ymm0,%ymm14,%ymm14 - -vpsrlw $8,%ymm5,%ymm15 -vpsllw $8,%ymm9,%ymm1 -vpor %ymm15,%ymm1,%ymm15 -vpand %ymm0,%ymm15,%ymm15 - -vpsrlw $4,%ymm9,%ymm1 -vpand %ymm0,%ymm1,%ymm1 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm11,32(%rdi) -vmovdqa %ymm12,64(%rdi) -vmovdqa %ymm13,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm14,160(%rdi) -vmovdqa %ymm15,192(%rdi) -vmovdqa %ymm1,224(%rdi) - -ret - -.global 
MLKEM_ASM_NAMESPACE(nttfrombytes_avx2) -MLKEM_ASM_NAMESPACE(nttfrombytes_avx2): -#consts -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMASK*2(%rdx),%ymm0 -call nttfrombytes128_avx -add $256,%rdi -add $192,%rsi -call nttfrombytes128_avx -ret - -#endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */ diff --git a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/tomont.S similarity index 64% rename from src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S rename to src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/tomont.S index 3f013a5fa..7774cec0b 100644 --- a/src/kem/ml_kem/mlkem-native_ml-kem-1024_x86_64/native/x86_64/src/fq.S +++ b/src/kem/ml_kem/mlkem-native_ml-kem-768_x86_64/native/x86_64/src/tomont.S @@ -14,63 +14,24 @@ #include "../../../common.h" #if defined(MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT) -#include "consts.h" +/* simpasm: header-end */ +#include "consts.h" #include "fq.inc" .text -reduce128_avx2: -#load -vmovdqa (%rdi),%ymm2 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm4 -vmovdqa 96(%rdi),%ymm5 -vmovdqa 128(%rdi),%ymm6 -vmovdqa 160(%rdi),%ymm7 -vmovdqa 192(%rdi),%ymm8 -vmovdqa 224(%rdi),%ymm9 - -red16 2 -red16 3 -red16 4 -red16 5 -red16 6 -red16 7 -red16 8 -red16 9 - -csubq 2 -csubq 3 -csubq 4 -csubq 5 -csubq 6 -csubq 7 -csubq 8 -csubq 9 - -#store -vmovdqa %ymm2,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm4,64(%rdi) -vmovdqa %ymm5,96(%rdi) -vmovdqa %ymm6,128(%rdi) -vmovdqa %ymm7,160(%rdi) -vmovdqa %ymm8,192(%rdi) -vmovdqa %ymm9,224(%rdi) - -ret - -.global MLKEM_ASM_NAMESPACE(reduce_avx2) -MLKEM_ASM_NAMESPACE(reduce_avx2): +.global MLKEM_ASM_NAMESPACE(tomont_avx2) +.balign 4 +MLKEM_ASM_NAMESPACE(tomont_avx2): #consts vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XV*2(%rsi),%ymm1 -call reduce128_avx2 +vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa 
AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx2 add $256,%rdi -call reduce128_avx2 +call tomont128_avx2 ret - tomont128_avx2: #load vmovdqa (%rdi),%ymm3 @@ -103,15 +64,5 @@ vmovdqa %ymm10,224(%rdi) ret -.global MLKEM_ASM_NAMESPACE(tomont_avx2) -MLKEM_ASM_NAMESPACE(tomont_avx2): -#consts -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XQ*2(%rsi),%ymm0 -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO*2(%rsi),%ymm1 -vmovdqa AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI*2(%rsi),%ymm2 -call tomont128_avx2 -add $256,%rdi -call tomont128_avx2 -ret - +/* simpasm: footer-start */ #endif /* MLKEM_NATIVE_ARITH_BACKEND_X86_64_DEFAULT */